Example #1
File: lstm.py  Project: mbencherif/Neural
    # learn alphabet from training data
    dataset = \
      dataset.DatasetProvider([cfg.get('data', 'train'),
                               cfg.get('data', 'test')])
    # now load training examples and labels
    train_x, train_y = dataset.load(cfg.get('data', 'train'))
    # now load test examples and labels
    test_x, test_y = dataset.load(cfg.get('data', 'test'))

    init_vectors = None
    # TODO: what are we doing for index 0 (oov words)?
    # use pre-trained word embeddings?
    if cfg.has_option('data', 'embed'):
        print 'embeddings:', cfg.get('data', 'embed')
        word2vec = word2vec_model.Model(cfg.get('data', 'embed'))
        init_vectors = [word2vec.select_vectors(dataset.word2int)]

    # turn x and y into numpy arrays, among other things
    maxlen = max([len(seq) for seq in train_x + test_x])
    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = pad_sequences(train_y, maxlen=maxlen)
    test_x = pad_sequences(test_x, maxlen=maxlen)
    test_y = pad_sequences(test_y, maxlen=maxlen)

    train_y = np.array([to_categorical(seq, 3) for seq in train_y])
    test_y = np.array([to_categorical(seq, 3) for seq in test_y])

    print 'train_x shape:', train_x.shape
    print 'train_y shape:', train_y.shape
    print 'test_x shape:', test_x.shape
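The snippet breaks off after the data preparation. For orientation, here is a minimal sketch of a sequence-labeling LSTM that could consume these arrays; the embedding dimension, hidden size, and the use of init_vectors as the Embedding layer's initial weights are assumptions for illustration, not code from the mbencherif/Neural project.

    # minimal sketch (assumption): token-level LSTM tagger over the padded inputs
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, TimeDistributed, Dense

    vocab_size = len(dataset.word2int)  # assumption: word2int already covers index 0
    embed_dim = 300                     # assumed embedding dimension
    hidden_units = 128                  # assumed LSTM size

    model = Sequential()
    # weights=init_vectors assumes the pre-trained matrix has shape (vocab_size, embed_dim)
    model.add(Embedding(vocab_size, embed_dim, input_length=maxlen,
                        weights=init_vectors))
    model.add(LSTM(hidden_units, return_sequences=True))
    # one softmax over the 3 label classes at every time step
    model.add(TimeDistributed(Dense(3, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=32,
              validation_data=(test_x, test_y))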
Example #2
  # learn alphabet from training examples
  dataset = dataset.DatasetProvider(train_file)
  # now load training examples and labels
  train_x, train_y = dataset.load(train_file)
  maxlen = max([len(seq) for seq in train_x])
  # now load test examples and labels
  test_x, test_y = dataset.load(test_file, maxlen=maxlen)

  init_vectors = None
  # TODO: what are we doing for index 0 (oov words)?
  # use pre-trained word embeddings?
  if cfg.has_option('data', 'embed'):
    print 'embeddings:', cfg.get('data', 'embed')
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    word2vec = word2vec_model.Model(embed_file)
    init_vectors = [word2vec.select_vectors(dataset.word2int)]

  # turn x and y into numpy arrays, among other things
  classes = len(set(train_y))
  train_x = pad_sequences(train_x, maxlen=maxlen)
  train_y = to_categorical(np.array(train_y), classes)
  test_x = pad_sequences(test_x, maxlen=maxlen)
  test_y = to_categorical(np.array(test_y), classes)

  print 'train_x shape:', train_x.shape
  print 'train_y shape:', train_y.shape
  print 'test_x shape:', test_x.shape
  print 'test_y shape:', test_y.shape, '\n'

  branches = [] # models to be merged
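The snippet stops right after initializing branches, the list of models to be merged. Below is a minimal sketch of one common branch-and-merge layout (parallel convolutional branches over a shared embedding, concatenated before a softmax classifier); it uses the Keras functional API with illustrative filter sizes and dimensions, and is not taken from the original project.

  # minimal sketch (assumption): parallel conv branches concatenated before the classifier
  from keras.models import Model
  from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, concatenate

  inputs = Input(shape=(maxlen,))
  embedded = Embedding(len(dataset.word2int), 300)(inputs)  # 300 is an assumed dimension

  branch_outputs = []
  for filter_len in (2, 3, 4):  # illustrative filter lengths
    conv = Conv1D(100, filter_len, activation='relu')(embedded)
    branch_outputs.append(GlobalMaxPooling1D()(conv))

  merged = concatenate(branch_outputs)
  outputs = Dense(classes, activation='softmax')(merged)

  model = Model(inputs, outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])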
Example #3
File: cnn_dense.py  Project: afcarl/Python
BATCH = 50
EPOCHS = 5
CLASSES = 2
EMBDIMS = 300
MAXLEN = 55
MAXFEATURES = 18000
FILTERS = 100
FILTLEN = 4

if __name__ == "__main__":

  dataset = dataset.DatasetProvider(MAXFEATURES)
  x, y = dataset.load_data()

  path = '/Users/Dima/Loyola/Data/Word2Vec/Models/GoogleNews-vectors-negative300.txt'
  word2vec = word2vec_model.Model(path)
  embedding_lookup = word2vec.select_vectors(dataset.alphabet)

  # turn x and y into numpy arrays among other things
  x = sequence.pad_sequences(x, maxlen=MAXLEN)
  y = k.utils.np_utils.to_categorical(np.array(y), CLASSES)

  # expand the padded integer sequences into a 3D tensor of embedding vectors
  x3d = np.zeros((x.shape[0], MAXLEN, EMBDIMS))
  for row in range(x.shape[0]):
    for col in range(MAXLEN):
      word = x[row, col]
      x3d[row, col, :] = embedding_lookup[word]
  
  scores = []
  folds = sk.cross_validation.KFold(len(y),
                                    n_folds=NFOLDS,