Example #1
def train(src,
          dest,
          pivot_num,
          pivot_min_st,
          dim,
          pivot_method='mi',
          pivots=None):
    outputs = pivot_num
    HUs = dim
    #get the representation learning training and validation data
    x, y, x_valid, y_valid, inputs = pre.preproc(pivot_num,
                                                 pivot_min_st,
                                                 src,
                                                 dest,
                                                 pivot_method=pivot_method,
                                                 pivots=pivots)

    model = Sequential()
    model.add(
        Dense(HUs, kernel_initializer='glorot_normal', input_shape=(inputs, )))
    #model.add(Dense(HUs, input_shape=(inputs,)))
    model.add(Activation('sigmoid'))
    model.add(Dense(outputs))
    model.add(Activation('sigmoid'))
    print(model.summary())
    #     opt = SGD(lr=0.1, decay=1e-5, momentum=0.9)
    opt = Adam()

    model.compile(optimizer=opt, loss='binary_crossentropy')

    #stops as soon as the validation loss stops decreasing
    earlyStopping = EarlyStopping(monitor='val_loss', patience=0, mode='min')
    #saving only the best model
    save_best = ModelCheckpoint("best_model",
                                monitor='val_loss',
                                verbose=0,
                                save_best_only=True,
                                mode='auto')

    h = model.fit(x,
                  y,
                  batch_size=50,
                  callbacks=[earlyStopping, save_best],
                  epochs=40,
                  validation_data=(x_valid, y_valid),
                  shuffle=True)
    print((h.history['val_loss'])[-1])
    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(pivot_min_st) + "_" + str(HUs)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    #reload the best checkpoint and save its weights
    model = load_model("best_model")
    np.save(weight_str, model.get_weights())
    del model
    gc.collect()
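A minimal invocation sketch for the variant above; the domain names and pivot settings are placeholders chosen for illustration, not values taken from the source.

# Hypothetical call: learn a 100-dimensional representation from "books" to "dvd"
# using 500 mutual-information pivots that appear at least 10 times.
train(src="books",
      dest="dvd",
      pivot_num=500,
      pivot_min_st=10,
      dim=100,
      pivot_method='mi')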
Example #2
def train(src, dest, pivot_num, pivot_min_st, dim):
    outputs = pivot_num
    HUs = dim
    #get the representation learning training and validation data
    x, y, x_valid, y_valid, inputs = pre.preproc(pivot_num, pivot_min_st, src,
                                                 dest)
    #train word2vec for the pivots embeddings
    w2v.wo2ve(src, dest, pivot_num, pivot_min_st, HUs)
    filename = src + "_to_" + dest + "/" + "pivot_mat/" + "pivot_mat_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(pivot_min_st) + "_" + str(dim)
    mat = np.load(filename)

    model = Sequential()
    #make sure the pivot matrix (the decoder) is fixed
    frozen_layer = Dense(outputs, trainable=False)
    model.add(
        Dense(HUs, kernel_initializer='glorot_normal', input_shape=(inputs, )))
    model.add(Activation('sigmoid'))
    model.add(frozen_layer)
    model.add(Activation('sigmoid'))
    print(model.summary())
    w = model.get_weights()
    #initializing the decoder with the pivot embeddings
    w[2] = mat.transpose()
    sgd = SGD(lr=0.1, decay=1e-5, momentum=0.9)
    model.set_weights(w)

    model.compile(optimizer=sgd, loss='binary_crossentropy')

    #stops as soon as the validation loss stops decreasing
    earlyStopping = EarlyStopping(monitor='val_loss', patience=0, mode='min')
    #saving only the best model
    save_best = ModelCheckpoint("best_model",
                                monitor='val_loss',
                                verbose=0,
                                save_best_only=True,
                                mode='auto')

    h = model.fit(x,
                  y,
                  batch_size=1,
                  callbacks=[earlyStopping, save_best],
                  epochs=10,
                  validation_data=(x_valid, y_valid),
                  shuffle=True)
    print((h.history['val_loss'])[-1])
    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(pivot_min_st) + "_" + str(HUs)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    #reload the best checkpoint and save its weights
    model = load_model("best_model")
    np.save(weight_str, model.get_weights())
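A sketch of how the saved weight list might later be applied to new feature vectors; the ".npy" suffix added by np.save and the assumption that w[0]/w[1] hold the encoder kernel and bias follow from the model layout above, but this loader is not part of the source.

import numpy as np

def encode_with_saved_weights(features, weight_str):
    # np.save appends ".npy"; the file holds the full weight list of the model.
    w = np.load(weight_str + ".npy", allow_pickle=True)
    # Apply the Dense(HUs) + sigmoid encoder: w[0] is the kernel, w[1] the bias.
    return 1.0 / (1.0 + np.exp(-(features @ w[0] + w[1])))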
Example #3
def train(src, dest, pivot_num, pivot_min_st, dim):
    outputs = pivot_num
    HUs = dim
    x, y, x_valid, y_valid, inputs = pre.preproc(pivot_num, pivot_min_st, src,
                                                 dest)
    w2v.wo2ve(src, dest, pivot_num, pivot_min_st, HUs)
    filename = src + "_to_" + dest + "/" + "pivot_mat/" + "pivot_mat_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(pivot_min_st) + "_" + str(dim)
    mat = np.load(filename)

    #mat= np.load("kw.npy")
    #print "the shape is ",mat[0].shape

    model = Sequential()
    frozen_layer = Dense(outputs, trainable=False)
    model.add(Dense(HUs, kernel_initializer='glorot_normal', input_shape=(inputs, )))
    #model.add(Dense(HUs, input_shape=(inputs,)))
    model.add(Activation('sigmoid'))
    model.add(frozen_layer)
    model.add(Activation('sigmoid'))
    print(model.summary())
    w = model.get_weights()
    w[2] = mat.transpose()
    sgd = SGD(lr=0.1, decay=1e-5, momentum=0.9)
    #sgd = SGD(lr=10, momentum=0.9)
    model.set_weights(w)

    model.compile(optimizer=sgd, loss='binary_crossentropy')

    earlyStopping = EarlyStopping(monitor='val_loss', patience=0, mode='min')
    save_best = ModelCheckpoint("best_model",
                                monitor='val_loss',
                                verbose=0,
                                save_best_only=True,
                                mode='auto')

    h = model.fit(x,
                  y,
                  batch_size=10,
                  callbacks=[earlyStopping],
                  epochs=50,
                  validation_data=(x_valid, y_valid),
                  shuffle=True)
    print((h.history['val_loss'])[-1])
    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(pivot_min_st) + "_" + str(HUs)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    np.save(weight_str, model.get_weights())
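Because the decoder layer is created with trainable=False and then seeded with the pivot-embedding matrix, one sanity check (illustrative only, not part of the original script) is to confirm that its kernel does not move during a short fit:

import numpy as np

def check_decoder_frozen(model, x, y):
    # Index 2 holds the decoder kernel that was seeded with mat.transpose() above.
    before = model.get_weights()[2].copy()
    model.fit(x, y, batch_size=10, epochs=1, verbose=0)
    after = model.get_weights()[2]
    assert np.allclose(before, after), "frozen decoder weights changed"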
Example #4
def train(src, dest, pivot_num, pivot_min_st, word_vector_size, topWords,
          max_review_len, hidden_units_num, iter_num, criteria):
    if (criteria == "BasicTRL"):
        assert (iter_num == 2)
    names, source_count, target_count = pre.preproc(pivot_num, pivot_min_st,
                                                    src, dest)
    filename = src + "_to_" + dest + "/pivots/" + str(pivot_num) + "_meta"
    with open(filename, 'rb') as f:
        meta = pickle.load(f)
    namesByCriteria = SortByCriteria(criteria, meta, names, source_count,
                                     target_count)

    for i in range(iter_num):
        train_PBLM(src, dest, pivot_num, pivot_min_st, word_vector_size,
                   topWords, max_review_len, hidden_units_num, namesByCriteria,
                   i, iter_num, criteria)
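A hypothetical invocation of the iterative trainer above; the domain names and hyperparameters are placeholders, with iter_num=2 chosen to satisfy the assert for the "BasicTRL" criterion.

train(src="books", dest="kitchen",
      pivot_num=100, pivot_min_st=10,
      word_vector_size=128, topWords=10000,
      max_review_len=500, hidden_units_num=256,
      iter_num=2, criteria="BasicTRL")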
Example #5
def train(src, dest, pivot_num, pivot_min_st):

    x, y, x_valid, y_valid, inputs = pre.preproc(pivot_num, pivot_min_st, src,
                                                 dest)
    print("training......")
    pivot_mat = np.zeros((pivot_num, inputs))
    print("0")
    for i in range(pivot_num):
        print(i)
        clf = linear_model.SGDClassifier(loss="modified_huber")
        clf.fit(x, y[:, i])
        pivot_mat[i] = clf.coef_
    print("finish traning")
    pivot_mat = pivot_mat.transpose()
    svd50 = TruncatedSVD(n_components=50)
    pivot_mat50 = svd50.fit_transform(pivot_mat)
    svd100 = TruncatedSVD(n_components=100)
    pivot_mat100 = svd100.fit_transform(pivot_mat)
    svd150 = TruncatedSVD(n_components=150)
    pivot_mat150 = svd150.fit_transform(pivot_mat)
    print("finished svd")

    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        50)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    np.save(weight_str, pivot_mat50)

    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        100)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    np.save(weight_str, pivot_mat100)

    weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(
        150)
    filename = weight_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    np.save(weight_str, pivot_mat150)
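The three SVD-and-save blocks above differ only in the target dimensionality; a condensed variant (a sketch, not the original code) could loop over the dimensions instead:

import os
import numpy as np
from sklearn.decomposition import TruncatedSVD

def reduce_and_save(pivot_mat, src, dest, dims=(50, 100, 150)):
    for d in dims:
        reduced = TruncatedSVD(n_components=d).fit_transform(pivot_mat)
        weight_str = src + "_to_" + dest + "/weights/w_" + src + "_" + dest + "_" + str(d)
        os.makedirs(os.path.dirname(weight_str), exist_ok=True)
        np.save(weight_str, reduced)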
Example #6
def train_PBLM(src, dest, pivot_num, pivot_min_st, word_vector_size, topWords,
               max_review_len, hidden_units_num):
    names = pre.preproc(pivot_num, pivot_min_st, src, dest)
    split_dir = src + "_to_" + dest
    # load the sentiment-classification training split
    with open(split_dir + "/split/train", 'rb') as f:
        train = pickle.load(f)

    unlabeled, source, target = pre.XML2arrayRAW(
        "data/" + src + "/" + src + "UN.txt",
        "data/" + dest + "/" + dest + "UN.txt")
    unlabeled = getClearList(unlabeled)
    train = getClearList(train)
    source_valid = len(source) // 5
    target_valid = len(target) // 5
    tok = Tokenizer(num_words=topWords, split=" ")
    tok.fit_on_texts(train + unlabeled)
    x_valid = unlabeled[:source_valid] + unlabeled[-target_valid:]
    x = unlabeled[source_valid:-target_valid] + train

    #you can reload the pivots if you want to avoid the pivot extraction
    '''
    filename =src + "_to_" + dest + "/pivots/"+str(pivot_num)

    with open(filename, 'rb') as f:
        names = pickle.load(f)
    '''
    Pdict = fill_pivot_dict(names)
    X_train = tok.texts_to_sequences(x)
    X_test = tok.texts_to_sequences(x_valid)

    #creates the model
    embedding_vecor_length = word_vector_size
    model = Sequential()

    model.add(
        Embedding(topWords,
                  embedding_vecor_length,
                  mask_zero=True,
                  embeddings_initializer="glorot_uniform",
                  input_length=max_review_len))
    model.add(LSTM(hidden_units_num, return_sequences=True))
    num_class = pivot_num + 2
    model.add(TimeDistributed(Dense(num_class, activation='softmax')))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'],
                  sample_weight_mode="temporal")
    model_str = src + "_to_" + dest + "/models/model_" + src + "_" + dest + "_" + str(
        pivot_num) + "_" + str(hidden_units_num) + "_" + str(
            word_vector_size) + "_" + ".model"
    print(model.summary())
    if not os.path.exists(os.path.dirname(model_str)):
        os.makedirs(os.path.dirname(model_str))
    #saves only the best model with respect to the validation loss
    modelCheckpoint = ModelCheckpoint(model_str,
                                      monitor='val_loss',
                                      verbose=0,
                                      save_best_only=True,
                                      save_weights_only=False,
                                      mode='min',
                                      period=1)
    #stops the training if the validation loss has not decreased during the last 2 epochs
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
    model.fit_generator(generator(X_train, 16, names, Pdict, tok,
                                  max_review_len, pivot_num),
                        steps_per_epoch=(len(X_train) // 16),
                        epochs=10,
                        validation_data=generator_val(X_test, 16, names, Pdict,
                                                      tok, max_review_len,
                                                      pivot_num),
                        validation_steps=(len(X_test) // 16),
                        callbacks=[earlyStopping, modelCheckpoint])
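The generator and generator_val helpers are not shown here; whatever else they do, the LSTM above expects fixed-length inputs of max_review_len tokens, so each batch would typically be padded along these lines (a sketch using Keras' standard pad_sequences, not the original helper):

from keras.preprocessing.sequence import pad_sequences

def pad_batch(tokenized_reviews, max_review_len):
    # Zero-pad (or truncate) each tokenized review to the Embedding input_length;
    # mask_zero=True in the model lets downstream layers ignore the padding.
    return pad_sequences(tokenized_reviews, maxlen=max_review_len)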