Пример #1
0
  params.batch_size = 20

# Separator string: empty when the 'char' splitter is selected, a single
# space for every other splitter.
SEPERATOR = '' if params.splitter == 'char' else ' '

if args.mode in ('train', 'eval', 'classify'):
  # A partition override replaces the requested mode with 'all' for the
  # purpose of reading data; args.mode itself still drives preshuffling.
  mode = 'all' if args.partition_override else args.mode

  # Only a training run asks the dataset to preshuffle.
  dataset = Dataset(batch_size=params.batch_size,
                    max_len=params.max_len + 1,
                    preshuffle=(args.mode == 'train'))
  print('reading data')
  # Read every context variable column plus the 'text' column.
  dataset.ReadData(args.data,
                   params.context_vars + ['text'],
                   splitter=params.splitter,
                   mode=mode)

if args.mode == 'train':
  # Word vocabulary: build it from the 'text' column unless a saved
  # vocabulary was supplied, in which case load that instead.
  if args.vocab is None:
    # Use 20 as the threshold when params has no min_vocab_count attribute.
    min_count = (params.min_vocab_count
                 if hasattr(params, 'min_vocab_count') else 20)
    vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=min_count)
  else:
    vocab = Vocab.Load(args.vocab)

  # One vocabulary per context variable. Each column value is wrapped in a
  # single-element list before being passed to MakeFromData.
  context_vocabs = {}
  for context_var in params.context_vars:
    v = Vocab.MakeFromData(
        [[entry] for entry in dataset.GetColumn(context_var)],
        no_special_syms=True,
        min_count=50)
    context_vocabs[context_var] = v
    print('num {0}: {1}'.format(context_var, len(v)))
Пример #2
0
    params.batch_size = 1

# Separator string: empty when the 'char' splitter is selected, a single
# space for every other splitter.
SEPERATOR = '' if params.splitter == 'char' else ' '

if args.mode in ('train', 'eval', 'classify', 'uniclass', 'geoclass'):
    mode = args.mode

    # Only a training run asks the dataset to preshuffle.
    dataset = Dataset(batch_size=params.batch_size,
                      max_len=params.max_len + 1,
                      preshuffle=(args.mode == 'train'))
    print('reading data')
    # Read every context variable column plus the 'text' column.
    dataset.ReadData(args.data, params.context_vars + ['text'],
                     types=params.context_var_types,
                     valdata=args.valdata,
                     splitter=params.splitter)

if args.mode == 'train':
    # Word vocabulary: build it from the 'text' column unless a saved
    # vocabulary was supplied, in which case load that instead.
    if args.vocab is None:
        vocab = Vocab.MakeFromData(dataset.GetColumn('text'),
                                   min_count=params.min_vocab_count)
    else:
        vocab = Vocab.Load(args.vocab)

    if params.splitter == 'word':
        # Character vocabulary: every word in the word vocabulary is split
        # with Vocab.Graphemes and wrapped in '{' ... '}' marker symbols.
        graphemes = [
            ['{'] + Vocab.Graphemes(word) + ['}']
            for word in vocab.GetWords()
        ]
        char_vocab = Vocab.MakeFromData(graphemes, min_count=1)