Example #1
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/smalldic.txt')

    print('-----------')
    # print vocab
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras' Embedding layer needs len(vocab) + 1 because index 0 is reserved for padding
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # encoding: map each token to an integer index
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # decoding: map each integer index back to its token

    decoder_mode = 3  # 0: plain seq2seq, 1: feed-back decoder [1], 2: peek decoder [2], 3: attention decoder [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled in %f seconds.' % (time_end - time_start))

    # # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    #
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    #
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    #
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print next_token
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print tempstr
    #     # print('Predict output:', predict_sequence)
    #
    # print ('Train Ended')

    # def predict(input_text):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024).decode()  # recv() returns bytes; decode to str before concatenating
        texts = []  # avoid shadowing the built-in name `list`
        # sample input (a Chinese news sentence); the model expects Chinese text:
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        texts.append(tmp)
        result = ''
        input_list = []
        for tmp_input in texts:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                # print next_token
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr

            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
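# (Hedged sketch, not part of the original snippet) a minimal client for the socket
# server above: it connects to the port used by train() on a hypothetical host,
# sends a raw sentence and prints the prediction string the model sends back.
import socket

def query_summarizer(text, host='127.0.0.1', port=50008):
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    client.connect((host, port))
    client.send(text.encode())          # the server reads at most 1024 bytes
    reply = client.recv(4096).decode()  # decoded prediction string
    client.close()
    return reply

# print(query_summarizer('...'))  # example call against a running server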
Example #2
def train(train_path, tokenizer_path):
    print('import data...')
    maxlen = 1024
    X, label, Y = text2sequence(train_path, tokenizer_path, maxlen)
    num_class = len(set(label))
    print('data import finished!')          
    tokenizer = pickle.load(open(tokenizer_path, 'rb'))
    num_words = len(tokenizer.word_index)+1
    print('prepare training data and validation data using k_fold')
    seed = 0
    k = 10
    k_fold = StratifiedKFold(n_splits = k, shuffle = True, random_state = seed)
    # 10-fold cross-validation split

    cw_1 = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}  # ignore class imbalance
    cw_2 = {0: 0.348709, 1: 3.457910, 2: 1.451396, 3: 2.116922,
            4: 17.358700, 5: 0.404727, 6: 3.370635, 7: 1.167362}  # per-class weight = (1/8) / class frequency
    class_weight = [cw_1, cw_2]  # treat both weighting schemes as equally important
    # On a 100-document test set, training without class_weight worked better than with it,
    # and using class_weight worked better than using cw_2 alone.
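    # (Illustrative sketch, not in the original) the inverse-frequency weights in cw_2
    # can also be derived from the label distribution itself; this assumes `label`
    # holds integer class ids in range(num_class).
    from collections import Counter
    label_counts = Counter(label)
    cw_auto = {c: len(label) / (num_class * label_counts[c])
               for c in label_counts}  # weight = (1 / num_class) / class frequency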

    print('create lstm model...')
    model = Sequential()
    model.add(Embedding(num_words, 300, input_length=maxlen))
    model.add(Dropout(0.5))
    model.add(Convolution1D(128, 3, padding = 'same', strides = 1))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size = 2))
    model.add(LSTM(64, recurrent_dropout=0.5))
    model.add(Dropout(0.5))
    model.add(Dense(num_class, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())    
    
    k_fold_cv_loss = []
    k_fold_cv_acc = []
        
    dt = datetime.now() 
    d = dt.date()
    h = dt.time().hour
    m = dt.time().minute
    time_str = '{}_{}{}'.format(d, h, m)
    
    mckpt = ModelCheckpoint('model/best-cnn_lstm_weights_{}.h5'.format(time_str), monitor='val_loss', mode='auto', verbose=1,
                            save_best_only=True, save_weights_only=True, period=1)
    rlstp = EarlyStopping(monitor='val_loss', patience=3)
    tb = TensorBoard(log_dir='./logs', embeddings_freq=1, write_images=1,
                     histogram_freq=1, batch_size=32)
    # note: only mckpt is passed to model.fit below; rlstp and tb are currently unused
    turn = 1
    # note: the same model object is trained across all folds, so weights carry over
    # between folds; rebuild the model inside the loop for an unbiased CV estimate
    for train_idx, valid_idx in k_fold.split(X, label):
        print('training fold {}...'.format(turn))
        turn += 1
        model.fit(X[train_idx], Y[train_idx], validation_data=(X[valid_idx], Y[valid_idx]), class_weight=None,
                  callbacks=[mckpt], verbose=2, epochs=10, batch_size=32)
        # Evaluate model
        loss, acc = model.evaluate(X[valid_idx], Y[valid_idx], verbose=0, batch_size=32)
        k_fold_cv_loss.append(loss)
        k_fold_cv_acc.append(acc)
        
    print("Model loss: {:0.6f}".format(np.mean(k_fold_cv_loss)))
    print("Model Accuracy: {:0.6f}%".format(np.mean(k_fold_cv_acc) * 100))

    # Save model
    model.save_weights('model/cnn_lstm_weights_{}.h5'.format(time_str))
    model.save('model/cnn_lstm_model_{}.h5'.format(time_str))
    with open('model/cnn_lstm_model_{}.json'.format(time_str), 'w') as outfile:
        outfile.write(model.to_json())
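# (Hedged sketch, not part of the original) reloading the artefacts saved by train()
# for later inference; the file names below are hypothetical examples of the
# time-stamped paths produced above.
from keras.models import load_model, model_from_json

# full model (architecture + weights) written by model.save(...)
restored = load_model('model/cnn_lstm_model_2019-01-01_1230.h5')

# or rebuild from the JSON architecture and load the weights separately
with open('model/cnn_lstm_model_2019-01-01_1230.json') as f:
    rebuilt = model_from_json(f.read())
rebuilt.load_weights('model/cnn_lstm_weights_2019-01-01_1230.h5')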
Example #3
tokenizer.word_index['no']  # quick check: index assigned to the token 'no'
sum(answers_test)           # quick check: sum of the test answer labels

#model creation
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

# input shape is (max_story_len,); the batch size is left unspecified, hence the trailing comma
input_sequence = Input((max_story_len, ))
question = Input((max_question_len, ))

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocabulary_len, output_dim=64))
input_encoder_m.add(Dropout(0.3))
#output: (samples, story_maxlen, embedding_dim)

input_encoder_c = Sequential()
input_encoder_c.add(
    Embedding(input_dim=vocabulary_len, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
#output: (samples, story_maxlen, question_maxlen)

question_encoder = Sequential()
question_encoder.add(
    Embedding(input_dim=vocabulary_len,
              output_dim=64,
              input_length=max_question_len))
question_encoder.add(Dropout(0.3))
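# (Hedged continuation sketch) the snippet stops after defining the encoders; in the
# standard Keras babI end-to-end memory network they are combined roughly as follows.
# Everything below reuses only names already defined or imported above.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# attention of the question over the story: (samples, story_maxlen, question_maxlen)
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# add the match weights to the second story encoding and swap the time axes
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

# concatenate with the question encoding, reduce with an RNN, predict an answer word
answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocabulary_len, activation='softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])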
Example #4
def computeRnnSkipgram(sourceTexts, targetTexts, irmodel, answers, filename):
    """Read source and target artefacts and built the RNN SKIPGRAM model and compute similarity for each pair of artefacts.
  Args:
    sourceTexts: a list of source artefacts tokenized with stopword removed;
    targetTexts: a list of target artefacts tokenized with stopword removed;
    answers: list of true links;
    irmodel: a statistic model result(LSI,LDA or VSM)
    #TODO align rnn vector without a statistic model result
    filename: file where the ir model result are saved.
  Returns:
    None.
    """
    # load the skip-gram word-embedding model
    tokenizer, embedding_matrix, sequences, maxlen, num_words, embedding_dim, artifact_pairs = ReadingWordEmb.compute_Skipgram(
        sourceTexts, targetTexts)

    artifacts1 = [x[0] for x in artifact_pairs]
    artifacts2 = [x[1] for x in artifact_pairs]
    sequences_1 = tokenizer.texts_to_sequences(artifacts1)
    sequences_2 = tokenizer.texts_to_sequences(artifacts2)
    #compute the number of common words for each pair of artifacts
    leaks = [[len(set(x1)),
              len(set(x2)),
              len(set(x1).intersection(x2))]
             for x1, x2 in zip(sequences_1, sequences_2)]

    # pad all sequences to the same length
    padded_data_1 = pad_sequences(sequences_1, maxlen=maxlen)
    padded_data_2 = pad_sequences(sequences_2, maxlen=maxlen)
    leaks = np.array(leaks)

    # build the answers vector (aligned with the artifact pairs)
    labels = CreateTraining_set.alignTraining_set(answers, irmodel)
    #building training set
    if os.path.exists("outputs/trainingSet.p"):
        print("training set already created")
        print("evaluation set already created")
    else:
        CreateTraining_set.create_training_set(labels)

    # retrieved training data
    train_indices = pickle.load(open("outputs/trainingSet.p", "rb"))
    train_indices.sort()
    train_data_1_all = np.array(padded_data_1)[train_indices, :]
    train_data_2_all = np.array(padded_data_2)[train_indices, :]
    train_labels_all = np.array(labels)[train_indices, :]
    train_leaks_all = np.array(leaks)[train_indices, :]

    VALIDATION_SPLIT = 0.1
    dev_idx = max(1, int(len(train_labels_all) * VALIDATION_SPLIT))

    #splitting training data and validation data
    train_data_1, val_data_1 = train_data_1_all[:-dev_idx], train_data_1_all[
        -dev_idx:]
    train_data_2, val_data_2 = train_data_2_all[:-dev_idx], train_data_2_all[
        -dev_idx:]
    train_labels, val_labels = train_labels_all[:-dev_idx], train_labels_all[
        -dev_idx:]
    train_leaks, val_leaks = train_leaks_all[:-dev_idx], train_leaks_all[
        -dev_idx:]
    '''  
    # retrieved evaluated data for testing the model
    test_indices=pickle.load(open("outputs/evaluationSet.p","rb"))
    test_data_1=np.array(padded_data_1)[test_indices,:]
    test_data_2=np.array(padded_data_2)[test_indices,:]
    test_labels=np.array(labels)[test_indices,:]
    test_leaks=np.array(leaks)[test_indices,:]
    
    test_data=np.array(artifact_pairs)[test_indices,:]
            
    del padded_data_1
    del padded_data_2
    gc.collect()
    '''
    #building Rnn model with LSTM
    RATE_DROP_LSTM = 0.17
    RATE_DROP_DENSE = 0.25
    NUMBER_LSTM = 50
    NUMBER_DENSE_UNITS = 50
    NUMBER_DENSE_UNITS_1 = 25
    ACTIVATION_FUNCTION = 'relu'

    # Creating word embedding layer
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=maxlen,
                                trainable=False)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(
        LSTM(NUMBER_LSTM,
             dropout=RATE_DROP_LSTM,
             recurrent_dropout=RATE_DROP_LSTM))

    # Creating LSTM Encoder layer for source artifact
    sequence_1_input = Input(shape=(maxlen, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # Creating LSTM Encoder layer for for target artifact
    sequence_2_input = Input(shape=(maxlen, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    # Creating leaks input
    leaks_input = Input(shape=(leaks.shape[1], ))
    #leaks_dense = Dense(NUMBER_DENSE_UNITS/2, activation=ACTIVATION_FUNCTION, input_shape=leaks_input.get_shape())
    #leaks_dense = Dense(NUMBER_DENSE_UNITS/2, activation=ACTIVATION_FUNCTION)(leaks_input)
    #print(leaks_dense.input_shape)

    # Merging two LSTM encodes vectors from sentences to
    # pass it to dense layer applying dropout and batch normalisation

    merged = concatenate([x1, x2, leaks_input])
    merged = BatchNormalization()(merged)
    merged = Dropout(RATE_DROP_DENSE)(merged)
    merged = Dense(NUMBER_DENSE_UNITS, activation=ACTIVATION_FUNCTION)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(RATE_DROP_DENSE)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (NUMBER_LSTM, NUMBER_DENSE_UNITS,
                                      RATE_DROP_LSTM, RATE_DROP_DENSE)
    model_save_directory = 'outputs/'
    checkpoint_dir = model_save_directory + 'checkpoints/' + str(
        int(time.time())) + '/'

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=False)

    numExec = 1
    RnnResult = []
    for i in range(0, numExec):

        #tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

        model.fit([train_data_1, train_data_2, train_leaks],
                  train_labels,
                  validation_data=([val_data_1, val_data_2,
                                    val_leaks], val_labels),
                  epochs=200,
                  batch_size=64,
                  shuffle=True,
                  callbacks=[early_stopping, model_checkpoint])

        preds = list(
            model.predict([padded_data_1, padded_data_2, leaks],
                          verbose=1).ravel())
        RnnResult.append(preds)

        loss, accuracy = model.evaluate([padded_data_1, padded_data_2, leaks],
                                        labels,
                                        verbose=1)
        print('Accuracy: %f' % (accuracy * 100))

    resvect = np.zeros((len(preds), 1))

    for res in RnnResult:
        for i in range(0, len(preds)):
            resvect[i] = resvect[i] + res[i]

    resvect = resvect / numExec  # average prediction over the numExec runs

    # note: the CSV below uses the last run's `preds`; the averaged `resvect`
    # is computed but not used when numExec > 1
    results = [(x[0], y[0], z) for (x, y), z in zip(artifact_pairs, preds)]

    print("RNN Skipgram model compute")

    #creation of the csv file
    with open(filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=("Artifact1", "Artifact2",
                                            "probability"))
        writer.writeheader()
        # in each row # add requirements names, model name , and value
        for res in results:
            writer.writerow({
                'Artifact1': str(res[0]),
                'Artifact2': str(res[1]),
                'probability': str(res[2])
            })

    print("similarity matrix build")
Example #5
def Mem_Model2(story_maxlen, query_maxlen, vocab_size):
    input_encoder_m = Sequential()
    input_encoder_m.add(
        Embedding(input_dim=vocab_size,
                  output_dim=128,
                  input_length=story_maxlen))
    input_encoder_m.add(Dropout(0.5))
    # output: (samples, story_maxlen, embedding_dim)
    # embed the question into a sequence of vectors
    question_encoder = Sequential()
    question_encoder.add(
        Embedding(input_dim=vocab_size,
                  output_dim=128,
                  input_length=query_maxlen))
    question_encoder.add(Dropout(0.5))
    # output: (samples, query_maxlen, embedding_dim)
    # compute a 'match' between input sequence elements (which are vectors)
    # and the question vector sequence
    match = Sequential()
    match.add(
        Merge([input_encoder_m, question_encoder], mode='dot', dot_axes=[2, 2]))
    match.add(Activation('softmax'))

    plot(match, to_file='model_1.png')

    # output: (samples, story_maxlen, query_maxlen)
    # embed the input into a single vector with size = story_maxlen:
    input_encoder_c = Sequential()
    # input_encoder_c.add(Embedding(input_dim=vocab_size,
    #                               output_dim=query_maxlen,
    #                               input_length=story_maxlen))
    input_encoder_c.add(
        Embedding(input_dim=vocab_size,
                  output_dim=query_maxlen,
                  input_length=story_maxlen))
    input_encoder_c.add(Dropout(0.5))
    # output: (samples, story_maxlen, query_maxlen)
    # sum the match vector with the input vector:
    response = Sequential()
    response.add(Merge([match, input_encoder_c], mode='sum'))
    # output: (samples, story_maxlen, query_maxlen)
    response.add(Permute(
        (2, 1)))  # output: (samples, query_maxlen, story_maxlen)

    plot(response, to_file='model_2.png')

    # concatenate the match vector with the question vector,
    # and do logistic regression on top
    answer = Sequential()
    answer.add(
        Merge([response, question_encoder], mode='concat', concat_axis=-1))
    # the original paper uses a matrix multiplication for this reduction step.
    # we choose to use a RNN instead.
    answer.add(LSTM(64))
    # one regularization layer -- more would probably be needed.
    answer.add(Dropout(0.5))
    answer.add(Dense(50))
    # 50 sigmoid outputs; note that the original babI example uses Dense(vocab_size)
    # with a softmax here to get a probability distribution over the vocabulary
    answer.add(Activation('sigmoid'))

    return answer
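# (Hedged usage sketch) Mem_Model2 relies on the legacy Merge layer and plot()
# utility, so it only runs on old Keras 1.x installations; with such a version the
# returned network could be compiled along these lines (the sizes and the choice of
# loss are illustrative, not taken from the original project).
mem_model = Mem_Model2(story_maxlen=68, query_maxlen=4, vocab_size=36)
mem_model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['accuracy'])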
Example #6
vocab_size = len(tokenizer.word_index) + 1
sentences = tokenizer.texts_to_sequences(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_review_len)
#sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1000)

# Number of features
# print(input_dim)
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_review_len))
model.add(Flatten())
model.add(layers.Dense(300, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
history = model.fit(X_train,
                    y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=256)

# For accuracy values
plt.plot(history.history['acc'])
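# (Hedged sketch) completing the accuracy plot; with metrics=['acc'] and
# validation_data set, the history dict also carries a 'val_acc' series in this
# Keras version.
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()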
Example #7
    print(train_data["encoded_text"][:3])

    # pad documents to a fixed length of SEQUENCE_LENGTH words

    X_train = pad_sequences(train_data["encoded_text"],
                            maxlen=SEQUENCE_LENGTH,
                            padding='post')
    X_eval = pad_sequences(eval_data["encoded_text"],
                           maxlen=SEQUENCE_LENGTH,
                           padding='post')
    print(X_train[:3])

    # define the model
    logger.debug("Model definition")
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, OUTPUT_DIM, input_length=SEQUENCE_LENGTH))
    model.add(Flatten())
    model.add(Dense(3, activation='softmax'))
    # compile the model
    optimizer = Adam(lr=KERAS_LEARNING_RATE, decay=decay)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',  # note: with a 3-way softmax output, 'categorical_crossentropy' is the usual choice
                  metrics=['acc'])
    # summarize the model
    print(model.summary())
    # fit the model
    # Use Early-Stopping
    callback_early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=KERAS_EARLY_STOPPING,
        verbose=VERBOSE,
Example #8
def run_model_varyembed(dataset,
                        numhidden,
                        hiddendim,
                        idx2word,
                        idx2label,
                        w2v,
                        basedir,
                        embedding_dim=400,
                        validate=True,
                        num_epochs=30):

    train_toks, valid_toks, test_toks, \
    train_lex, valid_lex, test_lex, \
    train_y, valid_y, test_y = dataset

    maxlen = max([len(l) for l in train_lex])
    if len(valid_lex) > 0:
        maxlen = max(maxlen, max([len(l) for l in valid_lex]))
    maxlen = max(maxlen, max([len(l) for l in test_lex]))

    vocsize = max(idx2word.keys()) + 1
    nclasses = max(idx2label.keys()) + 1

    # Pad inputs to max sequence length and turn into one-hot vectors
    train_lex = sequence.pad_sequences(train_lex, maxlen=maxlen)
    valid_lex = sequence.pad_sequences(valid_lex, maxlen=maxlen)
    test_lex = sequence.pad_sequences(test_lex, maxlen=maxlen)

    train_y = sequence.pad_sequences(train_y, maxlen=maxlen)
    valid_y = sequence.pad_sequences(valid_y, maxlen=maxlen)
    test_y = sequence.pad_sequences(test_y, maxlen=maxlen)

    train_y = vectorize_set(train_y, maxlen, nclasses)
    valid_y = vectorize_set(valid_y, maxlen, nclasses)
    test_y = vectorize_set(test_y, maxlen, nclasses)

    # Build the model
    ## BI-DIRECTIONAL
    print('Building the model...')
    H = numhidden
    model = Graph()

    model.add_input(name='input', input_shape=[maxlen], dtype='int')

    # Add embedding layer
    if w2v is None:
        model.add_node(Embedding(vocsize,
                                 embedding_dim,
                                 init='lecun_uniform',
                                 input_length=maxlen),
                       name='embed',
                       input='input')
    else:
        embeds = init_embedding_weights(idx2word, w2v)
        embed_dim = w2v.syn0norm.shape[1]
        model.add_node(Embedding(vocsize,
                                 embed_dim,
                                 input_length=maxlen,
                                 weights=[embeds],
                                 mask_zero=True),
                       name='embed',
                       input='input')

    # Build first hidden layer
    model.add_node(LSTM(hiddendim, return_sequences=True, activation='tanh'),
                   name='forward0',
                   input='embed')
    model.add_node(Dropout(0.1), name='dropout0f', input='forward0')
    model.add_node(LSTM(hiddendim,
                        return_sequences=True,
                        go_backwards=True,
                        activation='tanh'),
                   name='backwards0',
                   input='embed')
    model.add_node(Dropout(0.1), name='dropout0b', input='backwards0')

    # Build subsequent hidden layers
    if H > 1:
        for i in range(1, H):
            model.add_node(LSTM(hiddendim,
                                return_sequences=True,
                                activation='tanh'),
                           name='forward%d' % i,
                           input='dropout%df' % (i - 1))
            model.add_node(Dropout(0.1),
                           name='dropout%df' % i,
                           input='forward%d' % i)
            model.add_node(LSTM(hiddendim,
                                return_sequences=True,
                                go_backwards=True,
                                activation='tanh'),
                           name='backwards%d' % i,
                           input='dropout%db' % (i - 1))
            model.add_node(Dropout(0.1),
                           name='dropout%db' % i,
                           input='backwards%d' % i)

    # Finish up the network
    model.add_node(TimeDistributedDense(nclasses),
                   name='tdd',
                   inputs=['dropout%df' % (H - 1),
                           'dropout%db' % (H - 1)],
                   merge_mode='ave')
    model.add_node(Activation('softmax'), name='softmax', input='tdd')
    model.add_output(name='output', input='softmax')
    model.compile(optimizer='rmsprop',
                  loss={'output': 'categorical_crossentropy'})

    # Set up callbacks
    fileprefix = 'embed_varied_'
    am = approximateMatch.ApproximateMatch_SEQ(valid_toks,
                                               valid_y,
                                               valid_lex,
                                               idx2label,
                                               pred_dir=os.path.join(
                                                   basedir, 'predictions'),
                                               fileprefix=fileprefix)
    mc = callbacks.ModelCheckpoint(
        os.path.join(basedir, 'models',
                     'embedding.model.weights.{epoch:02d}.hdf5'))
    cbs = [am, mc]
    if validate:
        early_stopping = callbacks.EarlyStopping(monitor='val_loss',
                                                 patience=3)
        cbs.append(early_stopping)

    # Train the model
    print('Training...')
    hist = model.fit({
        'input': train_lex,
        'output': train_y
    },
                     nb_epoch=num_epochs,
                     batch_size=1,
                     validation_data={
                         'input': valid_lex,
                         'output': valid_y
                     },
                     callbacks=cbs)
    if validate:
        val_f1, best_model = learning_curve(
            hist,
            preddir=os.path.join(basedir, 'predictions'),
            pltname=os.path.join(
                basedir, 'charts',
                'hist_varyembed%d_nhidden%d.pdf' % (hiddendim, numhidden)),
            fileprefix=fileprefix)
    else:
        best_model = num_epochs - 1
        val_f1 = 0.0

    # Save model
    json_string = model.to_json()
    with open(os.path.join(basedir, 'models', 'embedding_model_architecture.json'),
              'w') as f:
        f.write(json_string)

    # Test
    bestmodelfile = os.path.join(
        basedir, 'models', 'embedding.model.weights.%02d.hdf5' % best_model)
    shutil.copyfile(bestmodelfile,
                    bestmodelfile.replace('.hdf5', '.best.hdf5'))
    if validate:
        model = model_from_json(
            open(
                os.path.join(basedir, 'models',
                             'embedding_model_architecture.json')).read())
        model.load_weights(bestmodelfile)

    scores = predict_score(model,
                           test_lex,
                           test_toks,
                           test_y,
                           os.path.join(basedir, 'predictions'),
                           idx2label,
                           maxlen,
                           fileprefix=fileprefix)

    scores['val_f1'] = val_f1

    return scores, hist.history, best_model
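# (Hedged sketch, not from the original project) Graph() is the Keras 0.x API; in
# later Keras releases the forward/backward LSTM pair above is usually written with
# Bidirectional. The sizes below are hypothetical placeholders.
from keras.models import Sequential
from keras.layers import (Activation, Bidirectional, Dense, Dropout, Embedding,
                          LSTM, TimeDistributed)

bi_model = Sequential()
bi_model.add(Embedding(10000, 400, input_length=100, mask_zero=True))
# merge_mode='ave' averages the two directions, mirroring the merge_mode='ave'
# used at the TimeDistributedDense node above
bi_model.add(Bidirectional(LSTM(128, return_sequences=True, activation='tanh'),
                           merge_mode='ave'))
bi_model.add(Dropout(0.1))
bi_model.add(TimeDistributed(Dense(10)))   # 10 = hypothetical number of classes
bi_model.add(Activation('softmax'))
bi_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')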
Example #9
f.close()

print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 200))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Build model...')

# create the model
embedding_vector_length = 200
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, weights=[embedding_matrix], input_length=traning_len))
model.add(LSTM(100))
#model.add(Flatten())
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_pad, y_train, epochs=10, validation_split=0.1, batch_size=128)
# Final evaluation of the model

loss, accuracy = model.evaluate(X_test_pad, y_test)
# note: to have any effect, this callback must be created before training and
# passed to model.fit(..., callbacks=[early_stop]); here it is constructed and discarded
early_stop = EarlyStopping(monitor='val_loss',
                           min_delta=0,
                           patience=2,
                           verbose=0, mode='auto')
print("Accuracy: ", accuracy * 100)
Example #10
    def make_network(
            allele_encoding_dims,
            kmer_size,
            peptide_amino_acid_encoding,
            embedding_input_dim,
            embedding_output_dim,
            allele_dense_layer_sizes,
            peptide_dense_layer_sizes,
            peptide_allele_merge_method,
            peptide_allele_merge_activation,
            layer_sizes,
            dense_layer_l1_regularization,
            dense_layer_l2_regularization,
            activation,
            init,
            output_activation,
            dropout_probability,
            batch_normalization,
            embedding_init_method,
            locally_connected_layers):
        """
        Helper function to make a keras network for class1 affinity prediction.
        """

        # We import keras here to avoid tensorflow debug output, etc. unless we
        # are actually about to use Keras.

        from keras.layers import Input
        import keras.layers
        from keras.layers.core import Dense, Flatten, Dropout
        from keras.layers.embeddings import Embedding
        from keras.layers.normalization import BatchNormalization

        if peptide_amino_acid_encoding == "embedding":
            peptide_input = Input(
                shape=(kmer_size,), dtype='int32', name='peptide')
            current_layer = Embedding(
                input_dim=embedding_input_dim,
                output_dim=embedding_output_dim,
                input_length=kmer_size,
                embeddings_initializer=embedding_init_method,
                name="peptide_embedding")(peptide_input)
        else:
            peptide_input = Input(
                shape=(
                    kmer_size,
                    vector_encoding_length(peptide_amino_acid_encoding)),
                dtype='float32',
                name='peptide')
            current_layer = peptide_input

        inputs = [peptide_input]

        kernel_regularizer = None
        l1 = dense_layer_l1_regularization
        l2 = dense_layer_l2_regularization
        if l1 > 0 or l2 > 0:
            kernel_regularizer = keras.regularizers.l1_l2(l1, l2)

        for (i, locally_connected_params) in enumerate(locally_connected_layers):
            current_layer = keras.layers.LocallyConnected1D(
                name="lc_%d" % i,
                **locally_connected_params)(current_layer)

        current_layer = Flatten(name="flattened_0")(current_layer)

        for (i, layer_size) in enumerate(peptide_dense_layer_sizes):
            current_layer = Dense(
                layer_size,
                name="peptide_dense_%d" % i,
                kernel_regularizer=kernel_regularizer,
                activation=activation)(current_layer)

        if batch_normalization:
            current_layer = BatchNormalization(name="batch_norm_early")(
                current_layer)

        if dropout_probability:
            current_layer = Dropout(dropout_probability, name="dropout_early")(
                current_layer)

        if allele_encoding_dims:
            allele_input = Input(
                shape=allele_encoding_dims,
                dtype='float32',
                name='allele')
            inputs.append(allele_input)
            allele_embedding_layer = Flatten(name="allele_flat")(allele_input)

            for (i, layer_size) in enumerate(allele_dense_layer_sizes):
                allele_embedding_layer = Dense(
                    layer_size,
                    name="allele_dense_%d" % i,
                    kernel_regularizer=kernel_regularizer,
                    activation=activation)(allele_embedding_layer)

            if peptide_allele_merge_method == 'concatenate':
                current_layer = keras.layers.concatenate([
                    current_layer, allele_embedding_layer
                ], name="allele_peptide_merged")
            elif peptide_allele_merge_method == 'multiply':
                current_layer = keras.layers.multiply([
                    current_layer, allele_embedding_layer
                ], name="allele_peptide_merged")
            else:
                raise ValueError(
                    "Unsupported peptide_allele_encoding_merge_method: %s"
                    % peptide_allele_merge_method)

            if peptide_allele_merge_activation:
                current_layer = keras.layers.Activation(
                    peptide_allele_merge_activation,
                    name="alelle_peptide_merged_%s" %
                         peptide_allele_merge_activation)(current_layer)
            
        for (i, layer_size) in enumerate(layer_sizes):
            current_layer = Dense(
                layer_size,
                activation=activation,
                kernel_regularizer=kernel_regularizer,
                name="dense_%d" % i)(current_layer)

            if batch_normalization:
                current_layer = BatchNormalization(name="batch_norm_%d" % i)\
                    (current_layer)

            if dropout_probability > 0:
                current_layer = Dropout(
                    dropout_probability, name="dropout_%d" % i)(current_layer)

        output = Dense(
            1,
            kernel_initializer=init,
            activation=output_activation,
            name="output")(current_layer)
        model = keras.models.Model(
            inputs=inputs,
            outputs=[output],
            name="predictor")
        return model
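    # (Hedged usage sketch) the values below are hypothetical and only illustrate the
    # shape of a call to make_network for a 9-mer peptide with an embedded amino-acid
    # encoding and no allele input; uncomment inside an environment where the
    # surrounding class is available.
    #
    # network = make_network(
    #     allele_encoding_dims=None,
    #     kmer_size=9,
    #     peptide_amino_acid_encoding="embedding",
    #     embedding_input_dim=21,
    #     embedding_output_dim=8,
    #     allele_dense_layer_sizes=[],
    #     peptide_dense_layer_sizes=[32],
    #     peptide_allele_merge_method="concatenate",
    #     peptide_allele_merge_activation="",
    #     layer_sizes=[16],
    #     dense_layer_l1_regularization=0.0,
    #     dense_layer_l2_regularization=0.0,
    #     activation="tanh",
    #     init="glorot_uniform",
    #     output_activation="sigmoid",
    #     dropout_probability=0.0,
    #     batch_normalization=False,
    #     embedding_init_method="glorot_uniform",
    #     locally_connected_layers=[])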
Example #11
File: models.py  Project: shivakasu/AES
def create_model(args, initial_mean_value, overal_maxlen, vocab):

    import keras.backend as K
    from keras.layers.embeddings import Embedding
    from keras.models import Sequential, Model
    from keras.layers.core import Dense, Dropout, Activation
    from nea.my_layers import Attention, MeanOverTime, Conv1DWithMasking

    ###############################################################################################################################
    ## Recurrence unit type
    #

    if args.recurrent_unit == 'lstm':
        from keras.layers.recurrent import LSTM as RNN
    elif args.recurrent_unit == 'gru':
        from keras.layers.recurrent import GRU as RNN
    elif args.recurrent_unit == 'simple':
        from keras.layers.recurrent import SimpleRNN as RNN

    ###############################################################################################################################
    ## Create Model
    #

    dropout_W = 0.5  # default=0.5
    dropout_U = 0.1  # default=0.1
    cnn_border_mode = 'same'
    if initial_mean_value.ndim == 0:
        initial_mean_value = np.expand_dims(initial_mean_value, axis=1)
    num_outputs = len(initial_mean_value)

    if args.model_type == 'cls':
        raise NotImplementedError

    elif args.model_type == 'reg':
        logger.info('Building a REGRESSION model')
        model = Sequential()
        model.add(Embedding(args.vocab_size, args.emb_dim, mask_zero=True))
        if args.cnn_dim > 0:
            model.add(
                Conv1DWithMasking(nb_filter=args.cnn_dim,
                                  filter_length=args.cnn_window_size,
                                  border_mode=cnn_border_mode,
                                  subsample_length=1))
        if args.rnn_dim > 0:
            model.add(
                RNN(args.rnn_dim,
                    return_sequences=False,
                    dropout_W=dropout_W,
                    dropout_U=dropout_U))
        if args.dropout_prob > 0:
            model.add(Dropout(args.dropout_prob))
        model.add(Dense(num_outputs))
        if not args.skip_init_bias:
            bias_value = (np.log(initial_mean_value) -
                          np.log(1 - initial_mean_value)).astype(K.floatx())
            model.layers[-1].b.set_value(bias_value)
        model.add(Activation('sigmoid'))
        model.emb_index = 0

    elif args.model_type == 'regp':
        logger.info('Building a REGRESSION model with POOLING')
        model = Sequential()
        model.add(Embedding(args.vocab_size, args.emb_dim, mask_zero=True))
        if args.cnn_dim > 0:
            model.add(
                Conv1DWithMasking(nb_filter=args.cnn_dim,
                                  filter_length=args.cnn_window_size,
                                  border_mode=cnn_border_mode,
                                  subsample_length=1))
        if args.rnn_dim > 0:
            model.add(
                RNN(args.rnn_dim,
                    return_sequences=True,
                    dropout_W=dropout_W,
                    dropout_U=dropout_U))
        if args.dropout_prob > 0:
            model.add(Dropout(args.dropout_prob))
        if args.aggregation == 'mot':
            model.add(MeanOverTime(mask_zero=True))
        elif args.aggregation.startswith('att'):
            model.add(
                Attention(op=args.aggregation,
                          activation='tanh',
                          init_stdev=0.01))
        model.add(Dense(num_outputs))
        if not args.skip_init_bias:
            bias_value = (np.log(initial_mean_value) -
                          np.log(1 - initial_mean_value)).astype(K.floatx())
            model.layers[-1].b.set_value(bias_value)
        model.add(Activation('sigmoid'))
        model.emb_index = 0

    elif args.model_type == 'breg':
        logger.info('Building a BIDIRECTIONAL REGRESSION model')
        from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
        model = Sequential()
        sequence = Input(shape=(overal_maxlen, ), dtype='int32')
        output = Embedding(args.vocab_size, args.emb_dim,
                           mask_zero=True)(sequence)
        if args.cnn_dim > 0:
            output = Conv1DWithMasking(nb_filter=args.cnn_dim,
                                       filter_length=args.cnn_window_size,
                                       border_mode=cnn_border_mode,
                                       subsample_length=1)(output)
        if args.rnn_dim > 0:
            forwards = RNN(args.rnn_dim,
                           return_sequences=False,
                           dropout_W=dropout_W,
                           dropout_U=dropout_U)(output)
            backwards = RNN(args.rnn_dim,
                            return_sequences=False,
                            dropout_W=dropout_W,
                            dropout_U=dropout_U,
                            go_backwards=True)(output)
        if args.dropout_prob > 0:
            forwards = Dropout(args.dropout_prob)(forwards)
            backwards = Dropout(args.dropout_prob)(backwards)
        merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
        densed = Dense(num_outputs)(merged)
        if not args.skip_init_bias:
            raise NotImplementedError
        score = Activation('sigmoid')(densed)
        model = Model(input=sequence, output=score)
        model.emb_index = 1

    elif args.model_type == 'bregp':
        logger.info('Building a BIDIRECTIONAL REGRESSION model with POOLING')
        from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
        model = Sequential()
        sequence = Input(shape=(overal_maxlen, ), dtype='int32')
        output = Embedding(args.vocab_size, args.emb_dim,
                           mask_zero=True)(sequence)
        if args.cnn_dim > 0:
            output = Conv1DWithMasking(nb_filter=args.cnn_dim,
                                       filter_length=args.cnn_window_size,
                                       border_mode=cnn_border_mode,
                                       subsample_length=1)(output)
        if args.rnn_dim > 0:
            forwards = RNN(args.rnn_dim,
                           return_sequences=True,
                           dropout_W=dropout_W,
                           dropout_U=dropout_U)(output)
            backwards = RNN(args.rnn_dim,
                            return_sequences=True,
                            dropout_W=dropout_W,
                            dropout_U=dropout_U,
                            go_backwards=True)(output)
        if args.dropout_prob > 0:
            forwards = Dropout(args.dropout_prob)(forwards)
            backwards = Dropout(args.dropout_prob)(backwards)
        forwards_mean = MeanOverTime(mask_zero=True)(forwards)
        backwards_mean = MeanOverTime(mask_zero=True)(backwards)
        merged = merge([forwards_mean, backwards_mean],
                       mode='concat',
                       concat_axis=-1)
        densed = Dense(num_outputs)(merged)
        if not args.skip_init_bias:
            raise NotImplementedError
        score = Activation('sigmoid')(densed)
        model = Model(input=sequence, output=score)
        model.emb_index = 1

    logger.info('  Done')

    ###############################################################################################################################
    ## Initialize embeddings if requested
    #

    if args.emb_path:
        from w2vEmbReader import W2VEmbReader as EmbReader
        logger.info('Initializing lookup table')
        emb_reader = EmbReader(args.emb_path, emb_dim=args.emb_dim)
        model.layers[model.emb_index].W.set_value(
            emb_reader.get_emb_matrix_given_vocab(
                vocab, model.layers[model.emb_index].W.get_value()))
        logger.info('  Done')

    return model
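# (Hedged usage sketch) create_model expects an argparse-style options object; the
# values below are hypothetical, and the call assumes the nea project's environment
# (its my_layers module and the Keras 1.x / Theano era APIs used above) is available.
import numpy as np
from argparse import Namespace

nea_args = Namespace(recurrent_unit='lstm', model_type='regp', vocab_size=4000,
                     emb_dim=50, cnn_dim=0, cnn_window_size=3, rnn_dim=300,
                     dropout_prob=0.5, aggregation='mot', skip_init_bias=True,
                     emb_path=None)
aes_model = create_model(nea_args, initial_mean_value=np.array([0.5]),
                         overal_maxlen=500, vocab={})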
Example #12
    def create_model(self):
        global keras_trigger_model

        model_input_dict = dict()
        outputs_to_merge_1 = []

        embedding_layer = Embedding(
            self.word_embeddings.shape[0],
            self.word_embeddings.shape[1],
            weights=[self.word_embeddings],
            trainable=self.hyper_params.train_embeddings,
            name=u'embedding_layer')
        # TODO : why was there a dropout=0.3 in the above Embedding layer?

        window_size = 2 * self.hyper_params.neighbor_distance + 1

        if self.features.sentence_word_embedding:
            sentence_word_embedding_input = Input(
                shape=(self.hyper_params.max_sentence_length, ),
                dtype=u'int32',
                name=u'sentence_word_embedding')
            outputs_to_merge_1.append(
                embedding_layer(sentence_word_embedding_input))
            model_input_dict[
                self.features.
                c_sentence_word_embedding] = sentence_word_embedding_input

        # For each word the pos_array_input defines the distance to the target word.
        # Embed each distance into an 'embedding_vec_length' dimensional vector space
        if self.features.trigger_word_position:
            trigger_word_position_input = Input(
                shape=(self.hyper_params.max_sentence_length, ),
                dtype=u'int32',
                name=u'sentence_word_position')
            outputs_to_merge_1.append(
                Embedding(2 * self.hyper_params.max_sentence_length,
                          self.hyper_params.position_embedding_vector_length)(
                              trigger_word_position_input))
            model_input_dict[
                self.features.
                c_trigger_word_position] = trigger_word_position_input

        # Sentence feature input is the result of merging word vectors and embeddings
        if self.features.sentence_ner_type:
            sentence_ner_type_input = Input(
                shape=(self.hyper_params.max_sentence_length, ),
                dtype=u'int32',
                name=u'sentence_entity_type')
            ner_embedding = Embedding(
                self.number_of_entity_bio_types,
                self.hyper_params.entity_embedding_vector_length)(
                    sentence_ner_type_input)
            outputs_to_merge_1.append(ner_embedding)
            model_input_dict[
                self.features.c_sentence_ner_type] = sentence_ner_type_input

        merged = concatenate(outputs_to_merge_1, axis=-1)

        # Note: border_mode='same' to keep output the same width as the input
        maxpools = []
        for filter_length in self.hyper_params.filter_lengths:
            conv = Convolution1D(self.hyper_params.number_of_feature_maps,
                                 filter_length,
                                 border_mode=u'same',
                                 activation='relu')(merged)
            maxpools.append(GlobalMaxPooling1D()(conv))

        outputs_to_merge_2 = []
        outputs_to_merge_2.extend(maxpools)

        if self.features.trigger_window:
            trigger_window_input = Input(shape=(window_size, ),
                                         dtype=u'int32',
                                         name=u'trigger_window')
            lex_words = embedding_layer(trigger_window_input)
            lex_flattened = Flatten()(lex_words)
            outputs_to_merge_2.append(lex_flattened)
            model_input_dict[
                self.features.c_trigger_window] = trigger_window_input

        merged_all = concatenate(
            outputs_to_merge_2
        )  # I used to use: merge(maxpools + [lex_flattened], mode=u'concat')

        # Dense MLP layer with dropout
        dropout = Dropout(self.hyper_params.dropout)(merged_all)
        out = Dense(self.num_output, activation=u'softmax')(dropout)

        model_inputs = [
            model_input_dict[k] for k in self.features.feature_strings
        ]

        keras_trigger_model = Model(inputs=model_inputs, outputs=[out])
        keras_trigger_model.compile(optimizer=self.optimizer,
                                    loss=u'categorical_crossentropy',
                                    metrics=[])

        self.model = keras_trigger_model