예제 #1
0
  def __init__(self, argv):
    """Parse the command-line tokens in ``argv``.

    ``argv`` is consumed in place: its first element is stored as
    ``self.prog`` and every following token is popped from the front.

    Recognized options are ``-h``, ``-dnet VALUE``, ``-log_file VALUE``
    and ``-log_level VALUE``. Any other token -- including a value-taking
    option that appears as the very last token -- is passed to
    ``self.usage`` as an unrecognized option.

    Once parsing is done the logger is created from the log options and a
    missing ``-dnet`` is logged as an error and reported via ``self.usage``.
    """
    self.prog = argv.pop(0)
    self.dnet = None

    log_file = 'stderr'
    log_level = 'info'

    while argv:
      # Pop from the list that the loop condition tests. The original popped
      # from sys.argv, which only worked when the caller passed sys.argv itself.
      tok = argv.pop(0)
      if tok == "-h":
        self.usage()
      elif tok == "-dnet" and argv:
        self.dnet = argv.pop(0)
      elif tok == "-log_file" and argv:
        log_file = argv.pop(0)
      elif tok == "-log_level" and argv:
        log_level = argv.pop(0)
      else:
        self.usage('Unrecognized {} option'.format(tok))

    # The logger must exist before the missing-option error can be logged.
    create_logger(log_file, log_level)
    if self.dnet is None:
      logging.error('missing -dnet option')
      self.usage()
예제 #2
0
    def __init__(self, argv):
        """Parse the command-line tokens in ``argv`` into option attributes.

        ``argv`` is consumed in place: its first element is stored as
        ``self.prog`` and every following token is popped from the front.

        Value-taking options store the (converted) next token on ``self``;
        ``-mask_prefix`` and ``-cuda`` are boolean switches. ``-log_file``
        and ``-log_level`` only configure the logger and are not stored.
        Any other token -- including a value-taking option that appears as
        the very last token -- is passed to ``self.usage`` as unrecognized.

        ``input_src``, ``input_sim`` and ``input_pre`` default to ``None`` so
        that they exist even when ``-i_src``/``-i_sim``/``-i_pre`` are not
        given.

        A missing ``-dnet`` is reported through ``self.usage``; afterwards the
        logger is created and the resulting options are logged.
        """
        self.prog = argv.pop(0)
        self.dnet = None
        self.input = None
        # Previously these three only came into existence when their option
        # was passed, so reading them afterwards could raise AttributeError.
        self.input_src = None
        self.input_sim = None
        self.input_pre = None
        self.prefix = None
        self.output = '-'
        self.model = None
        self.beam_size = 4
        self.n_best = 1
        self.max_size = 250
        self.alpha = 0.0
        self.format = 'pt'
        self.shard_size = 0
        self.max_length = 0
        self.mask_prefix = False
        self.batch_size = 30
        self.batch_type = 'sentences'
        self.cuda = False

        # option -> (attribute name, converter applied to the next token)
        valued = {
            '-dnet': ('dnet', str),
            '-beam_size': ('beam_size', int),
            '-n_best': ('n_best', int),
            '-max_size': ('max_size', int),
            '-alpha': ('alpha', float),
            '-format': ('format', str),
            '-i_src': ('input_src', str),
            '-i_sim': ('input_sim', str),
            '-i_pre': ('input_pre', str),
            '-p': ('prefix', str),
            '-o': ('output', str),
            '-m': ('model', str),
            '-shard_size': ('shard_size', int),
            '-max_length': ('max_length', int),
            '-batch_size': ('batch_size', int),
            '-batch_type': ('batch_type', str),
        }
        # option -> attribute set to True when the switch is present
        switches = {'-mask_prefix': 'mask_prefix', '-cuda': 'cuda'}
        # logger settings are kept local so they do not end up in __dict__
        log_opts = {'-log_file': 'stderr', '-log_level': 'info'}

        while argv:
            tok = argv.pop(0)
            if tok == "-h":
                self.usage()
            elif tok in valued and argv:
                attr, convert = valued[tok]
                setattr(self, attr, convert(argv.pop(0)))
            elif tok in switches:
                setattr(self, switches[tok], True)
            elif tok in log_opts and argv:
                log_opts[tok] = argv.pop(0)
            else:
                self.usage('Unrecognized {} option'.format(tok))

        if self.dnet is None:
            self.usage('missing -dnet option')

        create_logger(log_opts['-log_file'], log_opts['-log_level'])
        logging.info("Options = {}".format(self.__dict__))
    def __init__(self, argv):
        """Parse the command-line tokens in ``argv`` into network options.

        ``argv`` is consumed in place: its first element is stored as
        ``self.prog`` and every following token is popped from the front.

        ``-dnet``, ``-src_voc`` and ``-tgt_voc`` are stored as attributes.
        Network hyper-parameters are stored in the ``self.net`` dict under
        the option name without its leading dash; ``-share_embeddings`` is a
        boolean switch. ``-log_file``/``-log_level`` configure the logger.
        Any other token -- including a value-taking option that appears as
        the very last token -- is passed to ``self.usage`` as unrecognized.

        After parsing, the logger is created and each missing required option
        (``-dnet``, ``-src_voc``, ``-tgt_voc``) is logged as an error and
        reported via ``self.usage``.
        """
        self.prog = argv.pop(0)
        self.dnet = None
        self.src_voc = None
        self.tgt_voc = None
        ### contains all network parameters
        self.net = {
            'emb_dim': 512,
            'qk_dim': 64,
            'v_dim': 64,
            'ff_dim': 2048,
            'n_heads': 8,
            'n_layers': 6,
            'dropout': 0.1,
            'share_embeddings': False,
            'weight_decay': 0.0,
            'beta1': 0.9,
            'beta2': 0.998,
            'eps': 1e-9,
        }
        log_file = 'stderr'
        log_level = 'info'

        # options stored verbatim as attributes (name = option without '-')
        attr_opts = ('-dnet', '-src_voc', '-tgt_voc')
        # options stored in self.net -> converter applied to the next token
        net_opts = {
            '-emb_dim': int,
            '-qk_dim': int,
            '-v_dim': int,
            '-ff_dim': int,
            '-n_heads': int,
            '-n_layers': int,
            '-dropout': float,
            '-weight_decay': float,
            '-beta1': float,
            '-beta2': float,
            '-eps': float,
        }

        while argv:
            # Pop from the list that the loop condition tests. The original
            # popped from sys.argv, which only worked when the caller passed
            # sys.argv itself.
            tok = argv.pop(0)
            if tok == "-h":
                self.usage()
            elif tok in attr_opts and argv:
                setattr(self, tok[1:], argv.pop(0))
            elif tok in net_opts and argv:
                self.net[tok[1:]] = net_opts[tok](argv.pop(0))
            elif tok == "-share_embeddings":
                self.net['share_embeddings'] = True
            elif tok == "-log_file" and argv:
                log_file = argv.pop(0)
            elif tok == "-log_level" and argv:
                log_level = argv.pop(0)
            else:
                self.usage('Unrecognized {} option'.format(tok))

        # The logger must exist before missing-option errors can be logged.
        create_logger(log_file, log_level)
        if self.dnet is None:
            logging.error('missing -dnet option')
            self.usage()
        if self.src_voc is None:
            logging.error('missing -src_voc option')
            self.usage()
        if self.tgt_voc is None:
            logging.error('missing -tgt_voc option')
            self.usage()
예제 #4
0
  # Consume every remaining token of sys.argv, one option per iteration.
  # NOTE(review): if the program name is still at sys.argv[0] when this loop
  # starts it would hit the "Unrecognized" branch -- presumably it is removed
  # before this point; confirm against the code above this fragment.
  while len(sys.argv):
    tok = sys.argv.pop(0)
    if tok=="-h":
      # print the usage text to stderr and exit
      sys.stderr.write(usage);
      sys.exit()
    elif tok=="-min_freq":
      # NOTE(review): no check that a value follows; pop(0) raises IndexError
      # when -min_freq is the last token.
      min_freq = int(sys.argv.pop(0))
    elif tok=="-max_size":
      # NOTE(review): same missing-value caveat as -min_freq.
      max_size = int(sys.argv.pop(0))

    else:
      # any other token: report it, print the usage text and exit
      sys.stderr.write('Unrecognized {} option\n'.format(tok))
      sys.stderr.write(usage)
      sys.exit()

  # min_freq / max_size are only assigned above when their option is given;
  # presumably defaults are set before this fragment -- TODO confirm.
  create_logger(None, 'info')
  logging.info('min_freq = {}'.format(min_freq))
  logging.info('max_size = {}'.format(max_size))

  ###################
  ### count words ###
  ###################
  lflat = []
  # read all of stdin and split each line on whitespace
  ll = [l.split() for l in sys.stdin.readlines()]
  # flatten the per-line token lists into lflat; list() forces the map to run
  list(map(lflat.extend, ll))
  # token -> number of occurrences
  freq = Counter(lflat)

  #######################
  ### dump vocabulary ###
  #######################
  # '<pad>' is written first; the rest of the dump is outside this fragment
  print('<pad>')
예제 #5
0
    def __init__(self, argv):
        """Parse the command-line tokens in ``argv`` into training options.

        ``argv`` is consumed in place: its first element is stored as
        ``self.prog`` and every following token is popped from the front.

        Value-taking options store the (converted) next token on ``self``;
        ``-mask_prefix`` and ``-cuda`` are boolean switches. ``-log_file``
        and ``-log_level`` only configure the logger and are not stored.
        One trailing ``/`` is removed from the ``-dnet`` value.

        Any other token -- including a value-taking option that appears as
        the very last token -- is passed to ``self.usage`` as unrecognized
        (previously most such options raised ``IndexError`` instead).

        ``sim_train``, ``pre_train``, ``sim_valid`` and ``pre_valid`` default
        to ``None`` so that they exist even when their option is not given.

        Missing ``-dnet`` or ``-src_train``/``-tgt_train`` is reported through
        ``self.usage``. Finally the logger is created, the ``random``,
        ``numpy`` and ``torch`` generators are seeded with ``self.seed`` and
        the resulting options are logged.
        """
        self.prog = argv.pop(0)
        self.dnet = None
        self.src_train = None
        self.tgt_train = None
        self.src_valid = None
        self.tgt_valid = None
        # Previously these four only came into existence when their option
        # was passed, so reading them afterwards could raise AttributeError.
        self.sim_train = None
        self.pre_train = None
        self.sim_valid = None
        self.pre_valid = None
        ### learning
        self.max_steps = 0
        self.max_epochs = 0
        self.validate_every = 5000
        self.save_every = 5000
        self.report_every = 100
        self.keep_last_n = 5
        self.mask_prefix = False
        ### optim
        self.noam_scale = 2.0
        self.noam_warmup = 4000
        self.label_smoothing = 0.1
        self.loss = 'NLL'
        self.clip = 0.5
        ### data
        self.shard_size = 500000
        self.max_length = 100
        self.batch_size = 4096
        self.batch_type = 'tokens'

        self.cuda = False
        self.seed = 12345

        def strip_slash(path):
            # remove one trailing '/'; endswith() is safe on an empty string
            return path[:-1] if path.endswith('/') else path

        # option -> (attribute name, converter applied to the next token)
        valued = {
            '-dnet': ('dnet', strip_slash),
            '-max_steps': ('max_steps', int),
            '-max_epochs': ('max_epochs', int),
            '-validate_every': ('validate_every', int),
            '-save_every': ('save_every', int),
            '-report_every': ('report_every', int),
            '-keep_last_n': ('keep_last_n', int),
            '-noam_scale': ('noam_scale', float),
            '-noam_warmup': ('noam_warmup', float),
            '-label_smoothing': ('label_smoothing', float),
            '-loss': ('loss', str),
            '-clip': ('clip', float),
            '-src_train': ('src_train', str),
            '-sim_train': ('sim_train', str),
            '-pre_train': ('pre_train', str),
            '-tgt_train': ('tgt_train', str),
            '-src_valid': ('src_valid', str),
            '-sim_valid': ('sim_valid', str),
            '-pre_valid': ('pre_valid', str),
            '-tgt_valid': ('tgt_valid', str),
            '-shard_size': ('shard_size', int),
            '-max_length': ('max_length', int),
            '-batch_size': ('batch_size', int),
            '-batch_type': ('batch_type', str),
            '-seed': ('seed', int),
        }
        # option -> attribute set to True when the switch is present
        switches = {'-mask_prefix': 'mask_prefix', '-cuda': 'cuda'}
        # logger settings are kept local so they do not end up in __dict__
        log_opts = {'-log_file': 'stderr', '-log_level': 'info'}

        while argv:
            tok = argv.pop(0)
            if tok == "-h":
                self.usage()
            elif tok in valued and argv:
                attr, convert = valued[tok]
                setattr(self, attr, convert(argv.pop(0)))
            elif tok in switches:
                setattr(self, switches[tok], True)
            elif tok in log_opts and argv:
                log_opts[tok] = argv.pop(0)
            else:
                self.usage('Unrecognized {} option'.format(tok))

        if self.dnet is None:
            self.usage('missing -dnet option')

        if self.src_train is None or self.tgt_train is None:
            self.usage('missing -src_train/-tgt_train options')

        create_logger(log_opts['-log_file'], log_opts['-log_level'])
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        logging.info("Options = {}".format(self.__dict__))