Example #1
def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  params['mode'] = 'CPU'

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize'])
  
  if params['mode'] == 'GPU':
    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = gp.garray(bias_init_vector.reshape(1, bias_init_vector.size))
  else:
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  print 'regularizing: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  # initialize the Solver and the cost function
  solver = Solver()
  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  max_iters = 1
  for it in xrange(max_iters):
    if abort: break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
    if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
             train_ppl2, smooth_train_ppl2)

    # perform gradient check if desired, with a bit of a burnin time (10 iterations)
    #if it == 10 and do_grad_check:
    #  solver.gradCheck(batch, model, costfun)
    #  print 'done gradcheck. continue?'
    #  raw_input()
    #
    ## detect if loss is exploding and kill the job if so
    #total_cost = cost['total_cost']
    #if it == 0:
    #  total_cost0 = total_cost # store this initial cost
    #if total_cost > total_cost0 * 2:
    #  print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
    #  abort = True # set the abort flag, we'll break out
    #
    ## logging: write JSON files for visual inspection of the training
    #tnow = time.time()
    #if tnow > last_status_write_time + 60*1: # every now and then lets write a report
    #  last_status_write_time = tnow
    #  jstatus = {}
    #  jstatus['time'] = datetime.datetime.now().isoformat()
    #  jstatus['iter'] = (it, max_iters)
    #  jstatus['epoch'] = (epoch, max_epochs)
    #  jstatus['time_per_batch'] = dt
    #  jstatus['smooth_train_ppl2'] = smooth_train_ppl2
    #  jstatus['val_ppl2'] = val_ppl2 # just write the last available one
    #  jstatus['train_ppl2'] = train_ppl2
    #  json_worker_status['history'].append(jstatus)
    #  status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
    #  try:
    #    json.dump(json_worker_status, open(status_file, 'w'))
    #  except Exception, e: # todo be more clever here
    #    print 'tried to write worker status into %s but got error:' % (status_file, )
    #    print e
    #
    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    #is_last_iter = (it+1) == max_iters
    #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
    #  val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
    #  print 'validation perplexity = %f' % (val_ppl2, )
    #  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
    #  if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
    #    if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
    #      # if we beat a previous record or if this is the first time
    #      # AND we also beat the user-defined threshold or it doesnt exist
    #      top_val_ppl2 = val_ppl2
    #      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
    #      filepath = os.path.join(params['checkpoint_output_directory'], filename)
    #      checkpoint = {}
    #      checkpoint['it'] = it
    #      checkpoint['epoch'] = epoch
    #      checkpoint['model'] = model
    #      checkpoint['params'] = params
    #      checkpoint['perplexity'] = val_ppl2
    #      checkpoint['wordtoix'] = misc['wordtoix']
    #      checkpoint['ixtoword'] = misc['ixtoword']
    #      try:
    #        pickle.dump(checkpoint, open(filepath, "wb"))
    #        print 'saved checkpoint in %s' % (filepath, )
    #      except Exception, e: # todo be more clever here
    #        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
    #        print e
    cuda.close()
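The loop above smooths the raw per-batch perplexity with an exponential moving average (0.99 of the previous value plus 0.01 of the new one), seeded with the first observation. The snippet below is a minimal self-contained sketch of that smoothing; the name smooth_series and the sample values are made up for illustration.

def smooth_series(values, decay=0.99):
    # exponential moving average, seeded with the first raw value
    smoothed = []
    avg = None
    for v in values:
        avg = v if avg is None else decay * avg + (1.0 - decay) * v
        smoothed.append(avg)
    return smoothed

print(smooth_series([120.0, 80.0, 60.0, 55.0]))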
Example #2
def main(params):
    batch_size = params['batch_size']
    dataset = params['dataset']
    word_count_threshold = params['word_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    params['mode'] = 'CPU'

    # fetch the data provider
    dp = getDataProvider(dataset)

    misc = {
    }  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc[
        'ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)
    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model, misc['update'], misc['regularize'] = (init_struct['model'],
                                                 init_struct['update'],
                                                 init_struct['regularize'])

    if params['mode'] == 'GPU':
        # force overwrite here. This is a bit of a hack, not happy about it
        model['bd'] = gp.garray(
            bias_init_vector.reshape(1, bias_init_vector.size))
    else:
        model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    print 'updating: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['update'])
    print 'regularizing: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['regularize'])
    print 'number of learnable parameters total: %d' % (sum(
        model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    max_iters = 1
    for it in xrange(max_iters):
        if abort: break
        t0 = time.time()
        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = step_struct['stats']['ppl2']
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
              % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
                 train_ppl2, smooth_train_ppl2)

        # perform gradient check if desired, with a bit of a burnin time (10 iterations)
        #if it == 10 and do_grad_check:
        #  solver.gradCheck(batch, model, costfun)
        #  print 'done gradcheck. continue?'
        #  raw_input()
        #
        ## detect if loss is exploding and kill the job if so
        #total_cost = cost['total_cost']
        #if it == 0:
        #  total_cost0 = total_cost # store this initial cost
        #if total_cost > total_cost0 * 2:
        #  print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
        #  abort = True # set the abort flag, we'll break out
        #
        ## logging: write JSON files for visual inspection of the training
        #tnow = time.time()
        #if tnow > last_status_write_time + 60*1: # every now and then lets write a report
        #  last_status_write_time = tnow
        #  jstatus = {}
        #  jstatus['time'] = datetime.datetime.now().isoformat()
        #  jstatus['iter'] = (it, max_iters)
        #  jstatus['epoch'] = (epoch, max_epochs)
        #  jstatus['time_per_batch'] = dt
        #  jstatus['smooth_train_ppl2'] = smooth_train_ppl2
        #  jstatus['val_ppl2'] = val_ppl2 # just write the last available one
        #  jstatus['train_ppl2'] = train_ppl2
        #  json_worker_status['history'].append(jstatus)
        #  status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
        #  try:
        #    json.dump(json_worker_status, open(status_file, 'w'))
        #  except Exception, e: # todo be more clever here
        #    print 'tried to write worker status into %s but got error:' % (status_file, )
        #    print e
        #
        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        #is_last_iter = (it+1) == max_iters
        #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
        #  val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
        #  print 'validation perplexity = %f' % (val_ppl2, )
        #  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
        #  if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        #    if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
        #      # if we beat a previous record or if this is the first time
        #      # AND we also beat the user-defined threshold or it doesnt exist
        #      top_val_ppl2 = val_ppl2
        #      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
        #      filepath = os.path.join(params['checkpoint_output_directory'], filename)
        #      checkpoint = {}
        #      checkpoint['it'] = it
        #      checkpoint['epoch'] = epoch
        #      checkpoint['model'] = model
        #      checkpoint['params'] = params
        #      checkpoint['perplexity'] = val_ppl2
        #      checkpoint['wordtoix'] = misc['wordtoix']
        #      checkpoint['ixtoword'] = misc['ixtoword']
        #      try:
        #        pickle.dump(checkpoint, open(filepath, "wb"))
        #        print 'saved checkpoint in %s' % (filepath, )
        #      except Exception, e: # todo be more clever here
        #        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
        #        print e
        cuda.close()
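Every example delegates vocabulary construction to preProBuildWordVocab, which keeps only the words occurring at least word_count_threshold times and maps them to integer indices. The sketch below is a hypothetical illustration of that thresholding step only; the real function also derives bias_init_vector from the word frequencies, and the name build_vocab and the toy sentences are invented for the example.

def build_vocab(sentences, word_count_threshold):
    # count how often each word appears across all training sentences
    counts = {}
    for tokens in sentences:
        for w in tokens:
            counts[w] = counts.get(w, 0) + 1
    # keep only the words that occur at least word_count_threshold times
    kept = sorted(w for w, n in counts.items() if n >= word_count_threshold)
    wordtoix = dict((w, i) for i, w in enumerate(kept))
    ixtoword = dict((i, w) for i, w in enumerate(kept))
    return wordtoix, ixtoword

wordtoix, ixtoword = build_vocab([['a', 'dog', 'runs'], ['a', 'cat', 'runs']], 2)
print(wordtoix)  # only 'a' and 'runs' survive a threshold of 2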
Example #3
def main(params):
    batch_size = params['batch_size']
    dataset = params['dataset']  # name of the dataset flickr8k, flickr30k..
    word_count_threshold = params['word_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(dataset)
    completeData = dp.getData('train')

    misc = {
    }  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    #print 'dp.iterSentences', dp.iterSentences('train')
    misc['wordtoix'], misc[
        'ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)
    #printWordEmbedding(dp.iterSentences('train'),misc['wordtoix'])

    #print 'type;',type(completeData)
    # calculate weights of all unique words in vocab
    weightComputedData = calculateWeights(misc['wordtoix'], misc['ixtoword'],
                                          completeData)

    weightCalculationMethodSec()
    weightComputedData = getWeightsMethod2()
    print 'Done:'

    # delegate the initialization of the model to the Generator class
    BatchGenerator = GenericBatchGenerator()
    #decodeGenerator(params)

    # initialize encoder and decoder weight matrices
    init_struct = BatchGenerator.init(params, misc)
    model, misc['update'], misc['regularize'] = (init_struct['model'],
                                                 init_struct['update'],
                                                 init_struct['regularize'])

    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = bias_init_vector.reshape(
        1, bias_init_vector.size)  # remove and check

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    print 'updating: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['update'])
    print 'regularizing: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['regularize'])
    print 'number of learnable parameters total: %d' % (sum(
        model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    if params.get('init_model_from', ''):
        # load checkpoint
        checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
        model = checkpoint['model']  # overwrite the model

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc, weightComputedData)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    for it in xrange(max_iters):
        if abort: break
        t0 = time.time()
        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        #train_ppl2 = step_struct['stats']['ppl2']
        #if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out

        epoch = it * 1.0 / num_iters_one_epoch
        print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f' \
              % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'])

        total_cost = cost['total_cost']
        if it == 0:
            total_cost0 = total_cost
        if total_cost > total_cost0 * 2:
            print 'Aborting, cost seems to be exploding. '
            abort = True

        if (it + 1) == max_iters:
            top_val_ppl2 = val_ppl2
            filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                dataset, host, params['fappend'], val_ppl2)
            filepath = os.path.join(params['checkpoint_output_directory'],
                                    filename)
            checkpoint = {}
            checkpoint['it'] = it
            checkpoint['epoch'] = epoch
            checkpoint['model'] = model
            checkpoint['params'] = params
            checkpoint['perplexity'] = val_ppl2
            checkpoint['wordtoix'] = misc['wordtoix']
            checkpoint['ixtoword'] = misc['ixtoword']
            try:
                pickle.dump(checkpoint, open(filepath, "wb"))
                print 'saved checkpoint in %s' % (filepath, )
            except Exception, e:
                print 'tried to write checkpoint into %s but got error: ' % (
                    filepath, )
                print e
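Example #3 keeps the cost of the very first batch as a baseline and aborts once the current total cost exceeds twice that baseline, a cheap guard against a diverging run. A minimal standalone sketch of that check follows; should_abort and the demo numbers are illustrative.

def should_abort(costs, factor=2.0):
    # remember the cost of the first batch and abort if a later cost explodes past it
    baseline = None
    for it, cost in enumerate(costs):
        if it == 0:
            baseline = cost
        if cost > baseline * factor:
            return True
    return False

print(should_abort([10.0, 9.5, 9.8, 25.0]))  # True: the last cost more than doubled
print(should_abort([10.0, 9.5, 9.8, 9.1]))   # False: training looks stable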
Example #4
def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)

  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize'])

  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  print 'regularizing: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model'] # overwrite the model

  # initialize the Solver and the cost function
  solver = Solver()
  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  import csv
  csvfile = open(os.path.join(params['outdir'],params['generator']+'.csv'),'wb')
  csvout = csv.writer(csvfile,delimiter=',',quotechar='"')

  csv_val_file = open(os.path.join(params['outdir'],params['generator']+'_val.csv'),'wb')
  csv_val_out = csv.writer(csv_val_file,delimiter=',',quotechar='"')

  for it in xrange(max_iters):
    if abort: break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
    if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
             train_ppl2, smooth_train_ppl2)

    csvout.writerow([it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],train_ppl2, smooth_train_ppl2])
    csvfile.flush()

    if host != 'oliver-Aurora-R4':
      sys.stdout.flush()

    # os.system('./update_plots.sh')

    # perform gradient check if desired, with a bit of a burnin time (10 iterations)
    if it == 10 and do_grad_check:
      print 'disabling dropout for gradient check...'
      params['drop_prob_encoder'] = 0
      params['drop_prob_decoder'] = 0
      solver.gradCheck(batch, model, costfun)
      print 'done gradcheck, exiting.'
      sys.exit() # hmmm. probably should exit here

    # detect if loss is exploding and kill the job if so
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost # store this initial cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
      abort = True # set the abort flag, we'll break out

    # logging: write JSON files for visual inspection of the training
    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
      print 'validation perplexity = %f' % (val_ppl2, )

      cp_pred = {}
      cp_pred['it'] = it
      cp_pred['epoch'] = epoch
      cp_pred['model'] = model
      cp_pred['params'] = params
      cp_pred['perplexity'] = val_ppl2
      cp_pred['wordtoix'] = misc['wordtoix']
      cp_pred['ixtoword'] = misc['ixtoword']
      cp_pred['algorithm'] = params['generator']
      cp_pred['outdir'] = params['outdir']

      if is_last_iter:
        scores = eval_sentence_predictions.run(cp_pred)
        csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0],scores[1],scores[2],scores[3],scores[4],scores[5],scores[6]])
        csv_val_file.flush()
        omail.send('job finished'+params['generator'],'done')


      # abort training if the perplexity is no good
      min_ppl_or_abort = params['min_ppl_or_abort']
      if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
        print 'aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort)
        abort = True # abort the job

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if  val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesn't exist
          top_val_ppl2 = val_ppl2
          filename = 'model_%s_checkpoint_%s_%s_%s_%.2f.p' % (params['generator'],dataset, host, params['fappend'], val_ppl2)
          filepath = os.path.join(params['outdir'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']

          checkpoint['algorithm'] = params['generator']
          checkpoint['outdir'] = params['outdir']

          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e

          scores = eval_sentence_predictions.run(checkpoint)
          csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0],scores[1],scores[2],scores[3],scores[4],scores[5],scores[6]])
          csv_val_file.flush()
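Example #4 additionally streams per-iteration statistics into a CSV file (plus a second CSV for validation results) so training curves can be plotted while the job runs. The snippet below is a minimal sketch of that logging pattern; the file name train_log.csv, the column names, and the numbers are placeholders, and the 'wb' mode used in the example is the Python 2 convention for csv files.

import csv

with open('train_log.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"')
    writer.writerow(['iter', 'max_iters', 'seconds', 'epoch', 'loss_cost', 'reg_cost', 'ppl2', 'smooth_ppl2'])
    writer.writerow([0, 1000, 0.42, 0.0, 35.1, 0.3, 120.0, 120.0])
    f.flush()  # flushing after each row keeps partial logs usable, as in the example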
Example #5
def main(params, split):

    #import pdb; pdb.set_trace()

    batch_size = params['batch_size']
    dataset = params['dataset']
    feature_file = params['feature_file']
    class_count_threshold = params['class_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    json_file = 'dataset_mmdb_book_fps_30_samplesize_25_split_%d.json' % (
        split)

    # fetch the data provider
    dp = getDataProvider(dataset, feature_file, json_file)

    misc = {
    }  # stores various misc items that need to be passed around the framework

    # go over all training classes and find the vocabulary we want to use, i.e. the classes that occur
    # at least class_count_threshold number of times
    misc['classtoix'], misc[
        'ixtoclass'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), class_count_threshold)

    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model, misc['update'], misc['regularize'] = (init_struct['model'],
                                                 init_struct['update'],
                                                 init_struct['regularize'])

    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    print 'updating: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['update'])
    print 'regularizing: ' + ', '.join('%s [%dx%d]' %
                                   (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['regularize'])
    print 'number of learnable parameters total: %d' % (sum(
        model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    if params.get('init_model_from', ''):
        # load checkpoint
        checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
        model = checkpoint['model']  # overwrite the model

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(
        misc['ixtoclass'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoclass'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    lastsavedcheckpoint = ''
    for it in xrange(max_iters):
        if abort: break
        t0 = time.time()
        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = step_struct['stats']['ppl2']
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
              % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
                 train_ppl2, smooth_train_ppl2)

        print 'last saved checkpoint in %s' % (lastsavedcheckpoint, )
        # perform gradient check if desired, with a bit of a burnin time (10 iterations)
        if it == 10 and do_grad_check:
            print 'disabling dropout for gradient check...'
            params['drop_prob_encoder'] = 0
            params['drop_prob_decoder'] = 0
            solver.gradCheck(batch, model, costfun)
            print 'done gradcheck, exiting.'
            sys.exit()  # hmmm. probably should exit here

        # detect if loss is exploding and kill the job if so
        total_cost = cost['total_cost']
        if it == 0:
            total_cost0 = total_cost  # store this initial cost
        if total_cost > total_cost0 * 2:
            print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
            abort = True  # set the abort flag, we'll break out

        # logging: write JSON files for visual inspection of the training
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (
                    status_file, )
                print e

        # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            val_ppl2 = eval_split('val', dp, model, params,
                                  misc)  # perform the evaluation on VAL set
            print 'validation perplexity = %f' % (val_ppl2, )

            # abort training if the perplexity is no good
            min_ppl_or_abort = params['min_ppl_or_abort']
            if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
                print 'aborting job because validation perplexity %f > %f' % (
                    val_ppl2, min_ppl_or_abort)
                abort = True  # abort the job

            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesn't exist
                    top_val_ppl2 = val_ppl2

                    filename = 'model_checkpoint_%s_%s_%s_alpha_%2.2f_beta_%2.2f_split_%d.p' % (
                        dataset, host, params['fappend'], params['alpha'],
                        params['beta'], split)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['classtoix'] = misc['classtoix']
                    checkpoint['ixtoclass'] = misc['ixtoclass']
                    checkpoint['json_file'] = json_file

                    try:
                        if not (params['fappend'] == 'test'):
                            # if it == max_iters - 1 :
                            pickle.dump(checkpoint, open(filepath, "wb"))
                            print 'saved checkpoint in %s' % (filepath, )
                            lastsavedcheckpoint = filepath
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (
                            filepath, )
                        print e
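The checkpointing rule these examples share is: save only when the validation perplexity beats the best value seen so far and also beats an optional user threshold, where a negative value disables either check. A minimal sketch of that decision, with a hypothetical helper name should_checkpoint:

def should_checkpoint(val_ppl2, top_val_ppl2, threshold):
    # save if we beat the previous record (or there is no record yet)
    if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        # ... and we also beat the user threshold (or it is disabled)
        if val_ppl2 < threshold or threshold < 0:
            return True
    return False

print(should_checkpoint(18.5, -1, -1))    # first evaluation: save
print(should_checkpoint(20.0, 18.5, -1))  # worse than the best so far: skip
print(should_checkpoint(17.0, 18.5, 15))  # better, but above the threshold: skip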
Example #6
def main(params):
    batch_size = params["batch_size"]
    dataset = params["dataset"]
    word_count_threshold = params["word_count_threshold"]
    do_grad_check = params["do_grad_check"]
    max_epochs = params["max_epochs"]
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(dataset)

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc["wordtoix"], misc["ixtoword"], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences("train"), word_count_threshold
    )

    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model, misc["update"], misc["regularize"] = (init_struct["model"], init_struct["update"], init_struct["regularize"])

    # force overwrite here. This is a bit of a hack, not happy about it
    model["bd"] = bias_init_vector.reshape(1, bias_init_vector.size)

    print "model init done."
    print "model has keys: " + ", ".join(model.keys())
    print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["update"])
    print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["regularize"])
    print "number of learnable parameters total: %d" % (
        sum(model[k].shape[0] * model[k].shape[1] for k in misc["update"]),
    )

    if params.get("init_model_from", ""):
        # load checkpoint
        checkpoint = pickle.load(open(params["init_model_from"], "rb"))
        model = checkpoint["model"]  # overwrite the model
        print checkpoint["model"]

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize("train", ofwhat="sentences")
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params["eval_period"]
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc["ixtoword"])  # initially size of dictionary of confusion
    val_ppl2 = len(misc["ixtoword"])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status["params"] = params
    json_worker_status["history"] = []
    for it in xrange(max_iters):
        if abort:
            break
        t0 = time.time()
        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct["cost"]
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = step_struct["stats"]["ppl2"]
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        print "%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)" % (
            it,
            max_iters,
            dt,
            epoch,
            cost["loss_cost"],
            cost["reg_cost"],
            train_ppl2,
            smooth_train_ppl2,
        )

        # perform gradient check if desired, with a bit of a burnin time (10 iterations)
        if it == 10 and do_grad_check:
            print "disabling dropout for gradient check..."
            params["drop_prob_encoder"] = 0
            params["drop_prob_decoder"] = 0
            solver.gradCheck(batch, model, costfun)
            print "done gradcheck, exitting."
            sys.exit()  # hmmm. probably should exit here

        # detect if loss is exploding and kill the job if so
        total_cost = cost["total_cost"]
        if it == 0:
            total_cost0 = total_cost  # store this initial cost
        if total_cost > total_cost0 * 2:
            print "Aboring, cost seems to be exploding. Run gradcheck? Lower the learning rate?"
            abort = True  # set the abort flag, we'll break out

        # logging: write JSON files for visual inspection of the training
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            last_status_write_time = tnow
            jstatus = {}
            jstatus["time"] = datetime.datetime.now().isoformat()
            jstatus["iter"] = (it, max_iters)
            jstatus["epoch"] = (epoch, max_epochs)
            jstatus["time_per_batch"] = dt
            jstatus["smooth_train_ppl2"] = smooth_train_ppl2
            jstatus["val_ppl2"] = val_ppl2  # just write the last available one
            jstatus["train_ppl2"] = train_ppl2
            json_worker_status["history"].append(jstatus)
            status_file = os.path.join(params["worker_status_output_directory"], host + "_status.json")
            try:
                json.dump(json_worker_status, open(status_file, "w"))
            except Exception, e:  # todo be more clever here
                print "tried to write worker status into %s but got error:" % (status_file,)
                print e

        # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            val_ppl2 = eval_split("val", dp, model, params, misc)  # perform the evaluation on VAL set
            print "validation perplexity = %f" % (val_ppl2,)

            # abort training if the perplexity is no good
            min_ppl_or_abort = params["min_ppl_or_abort"]
            if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
                print "aborting job because validation perplexity %f < %f" % (val_ppl2, min_ppl_or_abort)
                abort = True  # abort the job

            write_checkpoint_ppl_threshold = params["write_checkpoint_ppl_threshold"]
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesn't exist
                    top_val_ppl2 = val_ppl2
                    filename = "model_checkpoint_%s_%s_%s_%.2f.p" % (dataset, host, params["fappend"], val_ppl2)
                    filepath = os.path.join(params["checkpoint_output_directory"], filename)
                    checkpoint = {}
                    checkpoint["it"] = it
                    checkpoint["epoch"] = epoch
                    checkpoint["model"] = model
                    checkpoint["params"] = params
                    checkpoint["perplexity"] = val_ppl2
                    checkpoint["wordtoix"] = misc["wordtoix"]
                    checkpoint["ixtoword"] = misc["ixtoword"]
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print "saved checkpoint in %s" % (filepath,)
                    except Exception, e:  # todo be more clever here
                        print "tried to write checkpoint into %s but got error: " % (filepat,)
                        print e
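The worker-status reports written roughly once a minute by these loops are plain JSON files: a fixed 'params' block plus a growing 'history' list of per-iteration snapshots. The sketch below shows the shape of one such file, assuming a local path worker_status.json; all values are placeholders.

import datetime
import json

status = {'params': {'dataset': 'flickr8k'}, 'history': []}
entry = {
    'time': datetime.datetime.now().isoformat(),
    'iter': (0, 1000),      # (current iteration, max_iters)
    'epoch': (0.0, 50),     # (current epoch, max_epochs)
    'time_per_batch': 0.42,
    'train_ppl2': 120.0,
    'val_ppl2': 120.0,
}
status['history'].append(entry)
with open('worker_status.json', 'w') as f:
    json.dump(status, f)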