Example #1
0
 def MakeCharVocabMat(self, word_vocab, char_vocab):
   """Build per-word character-id tensors for every word in the vocabulary.

   Each word is wrapped in '{' / '}' boundary markers, mapped to character
   ids via char_vocab, and right-padded with the id of '}' up to the length
   of the longest marked-up word.  Stores two non-trainable variables on
   self: word_lens (true sequence lengths) and words_as_chars (the padded
   id matrix).
   """
   char_seqs = [['{'] + Vocab.Graphemes(w) + ['}']
                for w in word_vocab.GetWords()]
   self.max_len = max(len(seq) for seq in char_seqs)

   pad_id = char_vocab['}']  # padding reuses the end-of-word marker's id
   lengths = [len(seq) for seq in char_seqs]
   grapheme_ids = []
   for seq in char_seqs:
     row = [char_vocab[c] for c in seq]
     row.extend([pad_id] * (self.max_len - len(row)))
     grapheme_ids.append(row)

   self.word_lens = tf.Variable(trainable=False, initial_value=lengths,
                                name='word_lens')
   self.words_as_chars = tf.Variable(trainable=False,
                                     initial_value=grapheme_ids,
                                     name='words_as_chars')
Example #2
0
File: rnnlm.py  Project: frankfan007/calm
    # NOTE(review): fragment — loads the text column plus every context
    # variable named in params; 'splitter' presumably selects word- vs.
    # character-level tokenization — confirm against the DataReader class.
    dataset.ReadData(args.data,
                     params.context_vars + ['text'],
                     splitter=params.splitter,
                     valdata=args.valdata,
                     types=params.context_var_types)

# Training mode: build (or load) every vocabulary before model construction.
if args.mode == 'train':
    # Word vocab: reuse a saved vocabulary when one is supplied, otherwise
    # derive it from the training text, dropping words rarer than
    # min_vocab_count.
    if args.vocab is not None:
        vocab = Vocab.Load(args.vocab)
    else:
        vocab = Vocab.MakeFromData(dataset.GetColumn('text'),
                                   min_count=params.min_vocab_count)

    if params.splitter == 'word':  # character vocab, word-split models only
        # Wrap each word in '{' / '}' boundary markers before collecting
        # its graphemes; min_count=1 keeps every character seen.
        graphemes = [['{'] + Vocab.Graphemes(x) + ['}']
                     for x in vocab.GetWords()]
        char_vocab = Vocab.MakeFromData(graphemes, min_count=1)
        char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle'))
    else:
        char_vocab = None

    context_vocabs = {}  # one vocabulary per context variable
    for i, context_var in enumerate(params.context_vars):
        # Numerical context variables get no vocabulary; None marks them.
        if hasattr(params, 'context_var_types'
                   ) and params.context_var_types[i] == 'numerical':
            context_vocabs[context_var] = None
            continue

        v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
Example #3
0
File: rnnlm.py  Project: robspringles/calm
                    preshuffle=args.mode=='train',
                    batch_size=params.batch_size)
  # Load training (and optional validation) data: the text column plus every
  # context variable named in params.
  print('reading data')  # parenthesized form: valid in both Python 2 and 3
  dataset.ReadData(args.data, params.context_vars + ['text'],
                   splitter=params.splitter,
                   valdata=args.valdata, types=params.context_var_types)

# Training mode: construct every vocabulary needed by the model.
if args.mode == 'train':
  # Word vocab: reuse a saved vocabulary when provided, otherwise derive one
  # from the training text (words rarer than min_vocab_count are dropped).
  if args.vocab is not None:
    vocab = Vocab.Load(args.vocab)
  else:
    vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=params.min_vocab_count)

  if params.splitter == 'word':  # character vocab, word-split models only
    # Wrap each word in '{' / '}' boundary markers before collecting its
    # graphemes; min_count=1 keeps every character seen.
    graphemes = [['{'] + Vocab.Graphemes(x) + ['}'] for x in vocab.GetWords()]
    char_vocab = Vocab.MakeFromData(graphemes, min_count=1)
    char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle'))
  else:
    char_vocab = None

  context_vocabs = {}  # one vocabulary per context variable
  for i, context_var in enumerate(params.context_vars):
    # Numerical context variables get no vocabulary; None marks them.
    if hasattr(params, 'context_var_types') and params.context_var_types[i] == 'numerical':
      context_vocabs[context_var] = None
      continue

    # Categorical context variable: build a vocab over its observed values
    # (min_count=50 filters rare categories; no special symbols added).
    v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                           min_count=50, no_special_syms=True)
    context_vocabs[context_var] = v