def __init__(self, input_dim, in_channel, channels, kernel_sizes, dropout,
             bottleneck_dim=0, param_init=0.1):
    """Stack of gated convolutional (GLU) blocks followed by a weight-normalized
    fully-connected GLU layer and an optional bottleneck projection."""
    super(GatedConvEncoder, self).__init__()

    (channels, kernel_sizes, _, _), _ = parse_cnn_config(channels, kernel_sizes, '', '')

    self.in_channel = in_channel
    assert input_dim % in_channel == 0
    self.input_freq = input_dim // in_channel
    self.bridge = None

    assert len(channels) > 0
    assert len(channels) == len(kernel_sizes)

    layers = OrderedDict()
    for lth in range(len(channels)):
        layers['conv%d' % lth] = ConvGLUBlock(kernel_sizes[lth][0], input_dim, channels[lth],
                                              weight_norm=True,
                                              dropout=dropout)
        input_dim = channels[lth]

    # weight normalization + GLU for the last fully-connected layer
    self.fc_glu = nn.utils.weight_norm(nn.Linear(input_dim, input_dim * 2),
                                       name='weight', dim=0)

    self._odim = int(input_dim)

    if bottleneck_dim > 0:
        self.bridge = nn.Linear(self._odim, bottleneck_dim)
        self._odim = bottleneck_dim

    self.layers = nn.Sequential(layers)

    # no temporal subsampling in this encoder
    self._factor = 1

    self.reset_parameters(param_init)
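# A minimal instantiation sketch (hypothetical values; the exact string format of
# `channels`/`kernel_sizes` is whatever parse_cnn_config expects, e.g. '_'-joined
# per-layer specs, so treat the literals below as placeholders):
#
#   enc = GatedConvEncoder(input_dim=80, in_channel=1,
#                          channels="512_512_512", kernel_sizes="(3,3)_(3,3)_(3,3)",
#                          dropout=0.1, bottleneck_dim=256)
#   print(enc._odim)  # 256: the bridge projection replaces the last channel dim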
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.n_units = args.n_units
    self.n_layers = args.n_layers
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
    self.dropout_embed = nn.Dropout(p=args.dropout_in)

    # select a pre-defined GCNN configuration from the lm_type suffix
    # (e.g. 'gated_conv_14B' -> '14B'); 'custom' builds the stack from args
    model_size = args.lm_type.replace('gated_conv_', '')

    blocks = OrderedDict()
    dropout = args.dropout_hidden
    if model_size == 'custom':
        blocks['conv1'] = ConvGLUBlock(args.kernel_size, args.emb_dim, args.n_units,
                                       bottlececk_dim=args.n_projs,
                                       dropout=dropout)
        for lth in range(args.n_layers - 1):
            blocks['conv%d' % (lth + 2)] = ConvGLUBlock(args.kernel_size, args.n_units, args.n_units,
                                                        bottlececk_dim=args.n_projs,
                                                        dropout=dropout)
        last_dim = args.n_units
    elif model_size == '8':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 900, dropout=dropout)
        for i in range(1, 8, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(4, 900, 900, dropout=dropout)
        last_dim = 900
    elif model_size == '8B':
        blocks['conv1'] = ConvGLUBlock(1, args.emb_dim, 512, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(5, 512, 512, bottlececk_dim=128, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 512, 512, bottlececk_dim=256, dropout=dropout)
        blocks['conv4'] = ConvGLUBlock(1, 512, 2048, bottlececk_dim=1024, dropout=dropout)
        last_dim = 2048
    elif model_size == '9':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 807, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d-1' % i] = ConvGLUBlock(4, 807, 807, dropout=dropout)
            blocks['conv2-%d-2' % i] = ConvGLUBlock(4, 807, 807, dropout=dropout)
        last_dim = 807
    elif model_size == '13':
        blocks['conv1'] = ConvGLUBlock(4, args.emb_dim, 1268, dropout=dropout)
        for i in range(1, 13, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(4, 1268, 1268, dropout=dropout)
        last_dim = 1268
    elif model_size == '14':
        for i in range(1, 4, 1):
            blocks['conv1-%d' % i] = ConvGLUBlock(6, args.emb_dim if i == 1 else 850, 850,
                                                  dropout=dropout)
        blocks['conv2'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
        for i in range(1, 5, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 850, 850, dropout=dropout)
        blocks['conv4'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv5-%d' % i] = ConvGLUBlock(4, 850, 850, dropout=dropout)
        blocks['conv6'] = ConvGLUBlock(4, 850, 1024, dropout=dropout)
        blocks['conv7'] = ConvGLUBlock(4, 1024, 2048, dropout=dropout)
        last_dim = 2048
    elif model_size == '14B':
        blocks['conv1'] = ConvGLUBlock(5, args.emb_dim, 512, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = ConvGLUBlock(5, 512, 512, bottlececk_dim=128, dropout=dropout)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = ConvGLUBlock(5, 512 if i == 1 else 1024, 1024,
                                                  bottlececk_dim=512, dropout=dropout)
        for i in range(1, 7, 1):
            blocks['conv4-%d' % i] = ConvGLUBlock(5, 1024 if i == 1 else 2048, 2048,
                                                  bottlececk_dim=1024, dropout=dropout)
        blocks['conv5'] = ConvGLUBlock(5, 2048, 4096, bottlececk_dim=1024, dropout=dropout)
        last_dim = 4096
    else:
        raise NotImplementedError(model_size)

    self.blocks = nn.Sequential(blocks)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            last_dim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = nn.Linear(last_dim, self.vocab)

        if args.tie_embedding:
            if args.n_units != args.emb_dim:
                raise ValueError('When using the tied flag, n_units must be equal to emb_dim.')
            self.output.weight = self.embed.weight

    self.reset_parameters(args.param_init)
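# A minimal instantiation sketch (hypothetical argument values; `args` is normally the
# parsed training config, mocked here with argparse.Namespace, and the class name
# GatedConvLM is assumed from context). With vocab=10000 the adaptive-softmax cutoffs
# above evaluate to [400, 2000]: a 400-word head cluster plus two tail clusters whose
# projection sizes shrink by div_value=4.0:
#
#   from argparse import Namespace
#   args = Namespace(lm_type='gated_conv_14B', emb_dim=512, n_units=512, n_layers=14,
#                    n_projs=0, kernel_size=4, lsm_prob=0.0, vocab=10000,
#                    dropout_in=0.1, dropout_hidden=0.1,
#                    adaptive_softmax=True, tie_embedding=False, param_init=0.1)
#   lm = GatedConvLM(args)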