Example #1
File: text_cnn.py  Project: 4ker/mxnet
import mxnet as mx
import numpy as np

import data_helpers  # project-local helper module

# setup_cnn_model and train_cnn are defined elsewhere in text_cnn.py

def main():
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
    x, y = data_helpers.load_data_with_word2vec(word2vec)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)

    # reshape for convolution input: add a channel axis -> (N, 1, words, embed)
    x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
    x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))

    num_embed = x_train.shape[-1]
    sentence_size = x_train.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)
    batch_size = 50

    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
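setup_cnn_model and train_cnn are project functions not shown in this snippet. For orientation, here is a minimal sketch of the symbol graph a Kim (2014)-style text CNN typically builds in MXNet's symbol API; the filter widths, filter count, label count, and function name below are illustrative assumptions, not values taken from the project:

import mxnet as mx

def build_text_cnn_symbol(batch_size, sentence_size, num_embed,
                          filter_sizes=(3, 4, 5), num_filter=100,
                          num_label=2, dropout=0.5):
    # Input shape: (batch, 1, sentence_size, num_embed), i.e. pre-embedded words
    data = mx.sym.Variable('data')
    pooled = []
    for fs in filter_sizes:
        # Convolve fs-word windows spanning the full embedding width
        conv = mx.sym.Convolution(data=data, kernel=(fs, num_embed),
                                  num_filter=num_filter)
        act = mx.sym.Activation(data=conv, act_type='relu')
        # Max-pool over every window position (max-over-time pooling)
        pool = mx.sym.Pooling(data=act, pool_type='max',
                              kernel=(sentence_size - fs + 1, 1))
        pooled.append(pool)
    # Concatenate the pooled features from all filter widths
    concat = mx.sym.Concat(*pooled, dim=1)
    h = mx.sym.Reshape(data=concat,
                       shape=(batch_size, num_filter * len(filter_sizes)))
    h = mx.sym.Dropout(data=h, p=dropout)
    fc = mx.sym.FullyConnected(data=h, num_hidden=num_label)
    return mx.sym.SoftmaxOutput(data=fc, name='softmax')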
Example #2
import mxnet as mx
import numpy as np

import data_helpers  # project-local helper module

# setup_cnn_model and train_cnn are defined elsewhere in the project

def main():
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
    x, y = data_helpers.load_data_with_word2vec(word2vec)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)

    # reshape for convolution input
    x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
    x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))

    num_embed = x_train.shape[-1]
    sentence_size = x_train.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)
    batch_size = 50

    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
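The device index is hard-coded (mx.gpu(1) in example #1, mx.gpu(0) here). When adapting these snippets to a machine that may lack a GPU, a guarded fallback avoids an immediate crash; note the assumption that mx.context.num_gpus() is available (MXNet 1.2 and later):

import mxnet as mx

# Use the first GPU when one is visible, otherwise fall back to CPU.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
print('using context:', ctx)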
Example #3
import logging

import mxnet as mx
import numpy as np

import data_helpers  # project-local helper module

logger = logging.getLogger(__name__)

def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    logger.info('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev))
    logger.info('train shape: %s', x_train.shape)
    logger.info('valid shape: %s', x_dev.shape)
    logger.info('sentence max words: %d', sentence_size)
    logger.info('embedding size: %d', embed_size)
    logger.info('vocab size: %d', vocab_size)

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return (train, valid, sentence_size, embed_size, vocab_size)
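mx.io.NDArrayIter wraps the in-memory arrays as fixed-size (optionally shuffled) batches. A self-contained sketch of consuming such an iterator, with random stand-in data in place of the movie-review set; the shapes are illustrative:

import mxnet as mx
import numpy as np

x = np.random.rand(130, 1, 56, 300).astype('float32')  # stand-in sentence tensors
y = np.random.randint(0, 2, 130).astype('float32')     # stand-in binary labels
it = mx.io.NDArrayIter(x, y, batch_size=50, shuffle=True)

for epoch in range(2):
    it.reset()                   # rewind the iterator between epochs
    for batch in it:
        data = batch.data[0]     # NDArray of shape (50, 1, 56, 300)
        label = batch.label[0]   # NDArray of shape (50,)
        # batch.pad counts the padded samples in a short final batch
        print(epoch, data.shape, label.shape, batch.pad)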
Example #4
import mxnet as mx
import numpy as np

import data_helpers  # project-local helper module

def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Construct data iter

    Parameters
    ----------
    batch_size: int
    num_embed: int
    pre_trained_word2vec: boolean
                        identify the pre-trained layers or not
    Returns
    ----------
    train_set: DataIter
                Train DataIter
    valid: DataIter
                Valid DataIter
    sentences_size: int
                array dimensions
    embedded_size: int
                array dimensions
    vocab_size: int
                array dimensions
    """
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embedded_size = x.shape[-1]
        sentences_size = x.shape[2]
        vocabulary_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embedded_size = num_embed
        sentences_size = x.shape[1]
        vocabulary_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentences_size)
    print('embedding size', embedded_size)
    print('vocab size', vocabulary_size)

    train_set = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)

    return train_set, valid, sentences_size, embedded_size, vocabulary_size
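The returned iterators plug directly into MXNet's Module training API. A hedged sketch of the downstream wiring, using a stand-in symbol and illustrative hyperparameters rather than the project's actual text-CNN graph and settings:

import mxnet as mx

# Stand-in symbol; the project would build the text-CNN graph instead.
data = mx.sym.Variable('data')
fc = mx.sym.FullyConnected(data=mx.sym.Flatten(data=data), num_hidden=2)
sym = mx.sym.SoftmaxOutput(data=fc, name='softmax')

# train_set and valid come from data_iter(...) above
mod = mx.mod.Module(symbol=sym, context=mx.cpu())
mod.fit(train_set,
        eval_data=valid,
        eval_metric='acc',
        optimizer='rmsprop',
        optimizer_params={'learning_rate': 0.0005},
        num_epoch=10)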
Example #5
import mxnet as mx
import numpy as np

import data_helpers  # project-local helper module

def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentence_size)
    print('embedding size', embed_size)
    print('vocab size', vocab_size)

    train = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)
    
    return (train, valid, sentence_size, embed_size, vocab_size)
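A closing note on the reshape step that recurs in every variant: np.reshape(x, (n, 1, words, embed)) only inserts a singleton channel axis, so each sentence's embedding matrix is treated as a one-channel image in NCHW layout by the convolution layers. An equivalent NumPy spelling, with illustrative dimensions:

import numpy as np

x = np.random.rand(2000, 56, 300)   # (sentences, max words, embedding dim)
x_nchw = x[:, np.newaxis, :, :]     # (2000, 1, 56, 300): adds the channel axis
assert x_nchw.shape == np.reshape(
    x, (x.shape[0], 1, x.shape[1], x.shape[2])).shape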