def train_pair(args, train_csv, test_csv):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    x_train, y_train, x_test, y_test, word_index = read_input_csv(
        train_csv, test_csv, args.nb_words, args.max_sequence_len)
    print('train tensor {}.'.format(x_train.shape))

    print('Preparing embedding matrix.')
    # initialize the embedding matrix with zero vectors.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
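    # row 0 stays all-zero: Keras Tokenizer indices start at 1, so index 0 is reserved for padding.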
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    # args.len_labels_index = len(labels_index)
    args.len_labels_index = 2  # fixed for sentiment detection.

    model = model_selector(args, embedding_matrix)

    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                 verbose=1, save_best_only=True)
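    # save_best_only=True keeps only the weights from the epoch with the lowest val_loss.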

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
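    # patience=1: stop as soon as val_loss fails to improve for one epoch.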
    tsb = TensorBoard(log_dir='./log', histogram_freq=0, write_graph=True, write_images=False)

    callbacks_list = [checkpoint, earlystop, tsb]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_split=0.1,
              nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list)
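    # (validation_split=0.1 above holds out the last 10% of x_train/y_train, taken before shuffling, for validation)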
    classes = earlystop.model.predict_classes(x_test, batch_size=args.batch_size)
    # np_utils.accuracy expects class indices, so convert the one-hot y_test first.
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(y_test))
    print('Test accuracy: {}.'.format(acc))
Example #2
def train(args):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    texts, labels = read_input_data(args.data_dir)
    # texts - list of text samples
    # labels - list of label ids
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text sample into 2D integer tensor
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)
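    # pad/truncate every sequence to max_sequence_len so the samples stack into a single 2D integer tensor.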
    x_train, y_train = train_data(data, labels)
    print(type(data))
    print(x_train[100])
    x_train = np.array(x_train).astype('int32')
    print(x_train[100])

    # Transform labels to be categorical variables
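    # (to_categorical maps each integer label id to a one-hot vector)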
    labels = to_categorical(np.asarray(labels))
    y_train = to_categorical(np.asarray(y_train))
    print('Shape of total data tensor:', data.shape)
    print('Shape of total label tensor:', labels.shape)

    # split the input data into training set and validation set
    indices = np.arange(x_train.shape[0])
    np.random.shuffle(indices)
    x_train = x_train[indices]
    y_train = y_train[indices]

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    x_val = data[indices]
    y_val = labels[indices]
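    # note: x_val/y_val are drawn from the full shuffled data set, so they can overlap with the training rows.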

    print('Preparing embedding matrix.')

    # initialize the embedding matrix with zero vectors.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    args.len_labels_index = 3

    model = model_selector(args, embedding_matrix)

    checkpoint_filepath = os.path.join(args.model_dir, "new.en.msd.weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_acc', verbose=1, save_best_only=True)
    callbacks_list = [checkpoint]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "new.en.msd.model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list, verbose=1)
    proba = model.predict_proba(data, batch_size=300)
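    # predict_proba returns per-class probabilities for every sample; each is saved below as a tab-separated row.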
    np.savetxt('new_en_msd', proba, delimiter='\t', fmt='%.6f')
Example #3
def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)

    embeddings_index = read_glove_vectors(
        "/home/quan/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt"
    )
    #embeddings_index = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/glove.txt")
    print('Found {} word vectors in embedding1.'.format(len(embeddings_index)))
    #print('Found {} word vectors in embedding2.'.format(len(embeddings_index2)))

    print('Processing input data')

    # input_name = ["input_CR_prccd.txt", "input_Sub_prccd.txt", "input_MPQA_prccd.txt", "inputPCQM_prccd.txt",
    #               "input_flood_phi_prccd.txt", "input_flood_colorado_prccd.txt", "input_flood_qeen_prccd.txt",
    #               "input_flood_manila_prccd.txt", "input_fire_australia_prccd.txt", "input_earthquake_chile_prccd.txt"]
    # label_name = ["label_CR.txt", "label_input_Sub.txt", "label_MPQA.txt", "labelPCQM.txt", "label_flood_phi.txt",
    #               "label_flood_colorado.txt", "label_flood_qeen.txt", "label_flood_manila.txt",
    #               "label_fire_australia.txt", "label_earthquake_chile.txt"]
    input_name = ["input_Nepal2.txt"]
    label_name = ["label_Nepal2.txt"]
    with open("30JanMulti_Nepal_Train2_W2V_nonstatic.txt", 'w') as result_CV:
        for idx in range(len(input_name)):
            texts, labels_index, labels = read_input_data(
                args.data_dir, input_name[idx], label_name[idx])
            # texts - list of text samples
            # labels_index - dictionary mapping label name to numeric id
            # labels - list of label ids
            print('Found {} texts.'.format(len(texts)))

            # Vectorize the text sample into 2D integer tensor
            tokenizer = Tokenizer(nb_words=args.nb_words)
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            word_index = tokenizer.word_index
            print('Found {} unique tokens.'.format(len(word_index)))

            data = pad_sequences(sequences, maxlen=args.max_sequence_len)

            # Transform labels to be categorical variables
            labels = to_categorical(np.asarray(labels))
            print('Shape of data tensor:', data.shape)
            print('Shape of label tensor:', labels.shape)

            # split the input data into training set and validation set
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)
            data = data[indices]
            labels = labels[indices]
            # nb_validation_samples = int(args.validation_split * data.shape[0])

            # x_train = data[:-nb_validation_samples]
            # y_train = labels[:-nb_validation_samples]
            # x_val = data[-nb_validation_samples:]
            # y_val = labels[-nb_validation_samples:]

            print('Preparing embedding matrix.')

            # initialize the embedding matrix with zero vectors for embedding1.
            nb_words = min(args.nb_words, len(word_index))
            embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
            for word, i in word_index.items():
                if i > nb_words:
                    continue
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
            args.nb_words = nb_words
            args.len_labels_index = len(labels_index)
            '''Remember to uncomment the model_selector variant that matches the model.fit call below.'''
            model = model_selector(args, embedding_matrix)
            #model = model_selector2(args, embedding_matrix, embedding_matrix2)
            #model = model_selector3(args, embedding_matrix)
            #
            cv_scores = []
            ROC_scores = []
            fold = 10
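            # manual 10-fold cross-validation: each pass holds out one window of the shuffled data for validation.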

            for i in range(0, fold):
                print("\n")
                print("\n")
                print("\n")
                print("-------------FOLD :", (i + 1))
                window_data = data.shape[0] // fold
                #   Generate batches from indices
                x_train1 = data[:i * window_data]
                x_train2 = data[(i + 1) * window_data:]

                y_train1 = labels[:i * window_data]
                y_train2 = labels[(i + 1) * window_data:]

                if i == 0:
                    x_train = x_train2
                    y_train = y_train2
                else:
                    x_train = np.concatenate((x_train1, x_train2), axis=0)
                    y_train = np.concatenate((y_train1, y_train2), axis=0)

                x_val = data[i * window_data:(i + 1) * window_data]
                y_val = labels[i * window_data:(i + 1) * window_data]

                #   Clear model and create
                model = None
                model = model_selector(args, embedding_matrix)

                # checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
                # earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
                # checkpointer = ModelCheckpoint(checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True)
                # callbacks_list = [earlystopper, checkpointer]
                # model_json = model.to_json()
                # with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
                #     json_file.write(model_json)
                #
                model.fit(x_train,
                          y_train,
                          epochs=30,
                          batch_size=32,
                          verbose=0)
                # model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=args.num_epochs,
                #           batch_size=args.batch_size, callbacks=callbacks_list)
                y_prob = model.predict(x_val)

                roc = metrics.roc_auc_score(y_val, y_prob)
                print("ROC Prediction (binary classification):", roc)
                scores = model.evaluate(x_val, y_val, verbose=0)
                print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
                cv_scores.append(scores[1] * 100)
                ROC_scores.append(roc * 100)

            print(input_name[idx])
            print("ACC: %.2f%% (+/- %.2f%%)" %
                  (np.mean(cv_scores), np.std(cv_scores)))
            print("ROC: %.2f%% (+/- %.2f%%)" %
                  (np.mean(ROC_scores), np.std(ROC_scores)))
            result_CV.write(input_name[idx] + " ACC: %.2f%% (+/- %.2f%%)" %
                            (np.mean(cv_scores), np.std(cv_scores)) +
                            " ROC: %.2f%% (+/- %.2f%%)" %
                            (np.mean(ROC_scores), np.std(ROC_scores)) + '\n')
            result_CV.write(time.asctime(time.localtime(time.time())) + '\n')
Example #4
def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)

    embeddings_index = read_glove_vectors(
        "/home/duong/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt"
    )
    #embeddings_index2 = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/glove2.txt")
    print('Found {} word vectors in embedding1.'.format(len(embeddings_index)))
    #print('Found {} word vectors in embedding2.'.format(len(embeddings_index2)))

    print('Processing input data')
    texts, labels_index, labels = read_input_data(args.data_dir)
    # texts - list of text samples
    # labels_index - dictionary mapping label name to numeric id
    # labels - list of label ids
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text sample into 2D integer tensor
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)

    # Transform labels to be categorical variables
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the input data into training set and validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(args.validation_split * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    print('Preparing embedding matrix.')

    # initialize the embedding matrix with zero vectors for embedding1.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    args.len_labels_index = len(labels_index)
    '''Remember to uncomment the model_selector variant that matches the model.fit call below.'''
    model = model_selector(args, embedding_matrix)
    #model = model_selector2(args, embedding_matrix, embedding_matrix2)
    #model = model_selector3(args, embedding_matrix)
    #
    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    # checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
    #                              verbose=1, save_best_only=True)
    # callbacks_list = [checkpoint]
    earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
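    # stop training once val_loss fails to improve for 3 consecutive epochs.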
    checkpointer = ModelCheckpoint(checkpoint_filepath,
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True)
    callbacks_list = [earlystopper, checkpointer]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train,
              y_train,
              validation_data=(x_val, y_val),
              nb_epoch=args.num_epochs,
              batch_size=args.batch_size,
              callbacks=callbacks_list)
    #model.fit([x_train, x_train], y_train, validation_data=([x_val, x_val], y_val), nb_epoch=args.num_epochs,
    #          batch_size=args.batch_size, callbacks=callbacks_list)
    #model.fit([x_train, x_train, x_train], y_train, validation_data=([x_val, x_val, x_val], y_val), nb_epoch=args.num_epochs,
    #          batch_size=args.batch_size, callbacks=callbacks_list)
    print("Test model ...")
    print("Loading ...", checkpoint_filepath)
    model.load_weights(checkpoint_filepath)
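    # evaluate with the best checkpointed weights rather than the weights from the final epoch.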
    y_prob = model.predict(x_val)
    roc = metrics.roc_auc_score(y_val, y_prob)
    print("ROC Prediction (binary classification):", roc)
Example #5
def run(args):
    # Data path
    path = args.data_dir
    #path = "data/"
    X_train = os.path.join(path, 'X.train')
    Y_train = os.path.join(path, 'Y.train')
    X_online_test = os.path.join(path, 'X.test')
    all_text_path = os.path.join(path, 'all_text')
    id_test = os.path.join(path, 'id.test')

    # Seed
    seed = 13
    # fix random seed for reproducibility
    np.random.seed(seed)
    #print("Reading all text...")
    all_texts = open(all_text_path).readlines()
    #print("Tokenizing...")
    tokenizer = Tokenizer(num_words=args.nb_words)
    #print("Fitting...")
    tokenizer.fit_on_texts(all_texts)
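    # fitting the tokenizer on the full corpus gives every split a single shared word-to-index mapping.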
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    print("Loading training data...")
    X_train, y_train = ld.load_data(shulf=True,
                                    X_train=X_train,
                                    Y_train=Y_train,
                                    tokenizer=tokenizer,
                                    max_len=args.max_sequence_len)

    # Select Model
    model = model_selector(args, word_index)
    print(model.summary())
    # Callback list
    callbacks = []
    #filepath = "weights-improvement-{epoch:02d}-{acc:.2f}.hdf5"
    #check_point = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    if args.early_stop == 1:
        early_stop = keras.callbacks.EarlyStopping(monitor='val_acc',
                                                   min_delta=0,
                                                   patience=0,
                                                   verbose=0,
                                                   mode='auto')
        callbacks.append(early_stop)

    print("Training...")
    r = model.fit(X_train,
                  y_train,
                  epochs=args.num_epochs,
                  batch_size=args.batch_size,
                  callbacks=callbacks,
                  validation_split=args.validation_split)
    # Find out the best validation accuracy
    best_val_acc = max(r.history['val_acc'])
    if (best_val_acc < 0.69):
        print("Low Val_Acc.")
    # load the online test set (X.test)
    X_validate, y_validate = ld.load_data(X_train=X_online_test,
                                          Y_train=None,
                                          tokenizer=tokenizer,
                                          max_len=args.max_sequence_len)
    # predict
    y_validate = model.predict(X_validate, verbose=0)
    # Convert the categorical (one-hot) probabilities to numerical class indices
    y_validate = np.argmax(y_validate, axis=1)
    # Compare with the baseline
    count_diff = 0
    count_same = 0
    if args.baseline:
        with open(args.baseline) as f:
            lines = f.readlines()
            for i in range(len(lines)):
                if y_validate[i] == int(lines[i].split('\t')[1]):
                    count_same += 1
                else:
                    count_diff += 1
    print("Same:%d" % count_same)
    # Load validation Ids
    ids = np.loadtxt(id_test, dtype=bytes).astype(str)
    assert (len(ids) == len(y_validate))
    # Generate result
    result = [[
        ids[i] + "\t" + str(y_validate[i]) + "\t" + "NULL" + "\t" + "NULL"
    ] for i in range(len(y_validate))]
    # Time
    ts = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    # Format output
    output = ts + (
        "_acc_%.2f_%s_%s_%d_%d_%d_%d_%d_%d" %
        (best_val_acc * 100, args.model_name, path.replace("/", "-"),
         args.nb_words, args.max_sequence_len, args.embedding_dim,
         args.batch_size, count_same, args.use_word_embedding))
    np.savetxt("result_" + output + ".txt", result, fmt='%s')
    # Done
    print("Done!")