def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super().__init__()
    self.seq2seq = StackedSelfAttentionEncoder(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        projection_dim=projection_dim,
        feedforward_hidden_dim=feedforward_hidden_dim,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads,
        use_positional_encoding=use_positional_encoding,
        dropout_prob=dropout_prob,
        residual_dropout_prob=residual_dropout_prob,
        attention_dropout_prob=attention_dropout_prob)
    self.hidden_dim = hidden_dim
    self.input_dim = input_dim

def transformer_seq2seq(input_dim: int,
                        model_dim: int,
                        feedforward_hidden_dim: int = 2048,
                        num_layers: int = 6,
                        projection_dim: int = 64,
                        num_attention_heads: int = 8,
                        ttype: str = 'custom',
                        dropout: float = 0.1) -> Seq2SeqEncoder:
    if ttype == 'custom':
        return TransformerEncoder(
            input_dim=input_dim,
            model_dim=model_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            dropout_prob=dropout)
    elif ttype == 'allen':
        return StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=model_dim,
            feedforward_hidden_dim=feedforward_hidden_dim,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            projection_dim=model_dim,
            dropout_prob=dropout)
    else:
        raise ValueError(f'Invalid transformer type {ttype}')

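A minimal usage sketch for the factory above, assuming `transformer_seq2seq` and AllenNLP's `StackedSelfAttentionEncoder` are in scope; the dimensions, batch size, and sequence length are illustrative assumptions, not values from the source.

# Hedged usage sketch: build the 'allen' variant and run a toy batch through it.
# All concrete numbers here are illustrative assumptions.
import torch

encoder = transformer_seq2seq(input_dim=300, model_dim=256,
                              feedforward_hidden_dim=512, num_layers=2,
                              num_attention_heads=4, ttype='allen')
tokens = torch.randn(8, 20, 300)      # (batch, seq_len, input_dim)
output = encoder(tokens, None)        # mask=None, as in the shape tests below
assert list(output.size()) == [8, 20, 256]   # output dim == model_dim (hidden_dim)
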
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "pass":
        return PassThroughEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2SeqWrapper(
            AllenNLPSequential(
                torch.nn.ModuleList(
                    [get_encoder(input_dim, output_dim, "bilstm-unwrapped", args)]),
                input_dim,
                output_dim,
                bidirectional=True,
                residual_connection=args.residual_connection,
                dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "self_attention":
        return IntraSentenceAttentionEncoder(input_dim=input_dim,
                                             projection_dim=output_dim)
    if encoder_type == "stacked_self_attention":
        return StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=output_dim,
            projection_dim=output_dim,
            feedforward_hidden_dim=output_dim,
            num_attention_heads=5,
            num_layers=3,
            dropout_prob=args.dropout,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")

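A hedged sketch of how `get_encoder` might be called; `args` below is a hypothetical stand-in namespace that provides only the attributes the helper reads (`dropout`, `residual_connection`), not the project's real configuration object.

# Hedged usage sketch; `args` is a hypothetical namespace for illustration only.
from types import SimpleNamespace
import torch

args = SimpleNamespace(dropout=0.1, residual_connection=False)

pass_enc = get_encoder(300, 300, "pass", args)                    # identity encoder
attn_enc = get_encoder(300, 200, "stacked_self_attention", args)  # output dim == hidden_dim == 200

inputs = torch.randn(2, 7, 300)          # (batch, seq_len, input_dim)
assert list(attn_enc(inputs, None).size()) == [2, 7, 200]
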
def __init__(self, args, input_dim, hidden_dim, word_embedder):
    super(DefinitionSentenceEncoder, self).__init__()
    self.config = args
    self.args = args
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.projection_dim = input_dim
    self.feedforward_hidden_dim = input_dim
    self.num_layers = self.args.num_layers_for_stackatt
    self.num_attention_heads = self.args.num_atthead_for_stackatt
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)

    self.mentiontransformer = StackedSelfAttentionEncoder(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        projection_dim=self.projection_dim,
        feedforward_hidden_dim=self.feedforward_hidden_dim,
        num_layers=self.num_layers,
        num_attention_heads=self.num_attention_heads)
    self.senttransformer = StackedSelfAttentionEncoder(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        projection_dim=self.projection_dim,
        feedforward_hidden_dim=self.feedforward_hidden_dim,
        num_layers=self.num_layers,
        num_attention_heads=self.num_attention_heads)

    self.ff_seq2vecs = nn.Linear(input_dim, input_dim)

    self.rnn = PytorchSeq2VecWrapper(
        nn.LSTM(bidirectional=True,
                num_layers=2,
                input_size=input_dim,
                hidden_size=hidden_dim // 2,
                batch_first=True,
                dropout=self.args.lstmdropout))

    self.bow = BagOfEmbeddingsEncoder(input_dim, self.args.bow_avg)

def test_get_dimension_is_correct(self):
    encoder = StackedSelfAttentionEncoder(input_dim=9,
                                          hidden_dim=12,
                                          projection_dim=7,
                                          feedforward_hidden_dim=5,
                                          num_layers=3,
                                          num_attention_heads=3)
    assert encoder.get_input_dim() == 9
    # the output dimension is the encoder's hidden_dim, not the projection_dim
    assert encoder.get_output_dim() == 12

def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

def __init__(self,
             _embsize: int,
             kernels_mu: List[float],
             kernels_sigma: List[float],
             att_heads: int,
             att_layer: int,
             att_proj_dim: int,
             att_ff_dim: int,
             win_size: List[int],
             max_windows: List[int]):
    super(TK_v2, self).__init__()

    n_kernels = len(kernels_mu)
    if len(kernels_mu) != len(kernels_sigma):
        raise Exception("len(kernels_mu) != len(kernels_sigma)")

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    self.mixer = nn.Parameter(
        torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True))

    self.stacked_att = StackedSelfAttentionEncoder(
        input_dim=_embsize,
        hidden_dim=_embsize,
        projection_dim=att_proj_dim,
        feedforward_hidden_dim=att_ff_dim,
        num_layers=att_layer,
        num_attention_heads=att_heads,
        dropout_prob=0,
        residual_dropout_prob=0,
        attention_dropout_prob=0)

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    # one scaler and one kernel-weight layer per window size
    self.nn_scaler = nn.ParameterList([
        nn.Parameter(torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
        for w in win_size
    ])
    self.kernel_weights = nn.ModuleList(
        [nn.Linear(n_kernels, 1, bias=False) for w in win_size])

    self.window_size = win_size
    self.window_scorer = []
    for w in max_windows:
        l = nn.Linear(w, 1, bias=False)
        torch.nn.init.constant_(l.weight, 1 / w)
        self.window_scorer.append(l)
    self.window_scorer = nn.ModuleList(self.window_scorer)

    self.window_merger = nn.Linear(len(self.window_size), 1, bias=False)

def test_stacked_self_attention_can_run_forward_on_multiple_gpus(self):
    encoder = StackedSelfAttentionEncoder(input_dim=9,
                                          hidden_dim=12,
                                          projection_dim=9,
                                          feedforward_hidden_dim=5,
                                          num_layers=3,
                                          num_attention_heads=3).to(0)
    parallel_encoder = DataParallel(encoder, device_ids=[0, 1])

    inputs = torch.randn([3, 5, 9]).to(0)
    encoder_output = parallel_encoder(inputs, None)

    assert list(encoder_output.size()) == [3, 5, 12]

def test_stacked_self_attention_can_run_forward(self):
    # Correctness checks are elsewhere - this is just stacking
    # blocks which are already well tested, so we just check shapes.
    encoder = StackedSelfAttentionEncoder(input_dim=9,
                                          hidden_dim=12,
                                          projection_dim=9,
                                          feedforward_hidden_dim=5,
                                          num_layers=3,
                                          num_attention_heads=3)
    inputs = Variable(torch.randn([3, 5, 9]))
    encoder_output = encoder(inputs, None)
    assert list(encoder_output.size()) == [3, 5, 12]

def __init__(self,
             _embsize: int,
             kernels_mu: List[float],
             kernels_sigma: List[float],
             att_heads: int,
             att_layer: int,
             att_proj_dim: int,
             att_ff_dim: int):
    super(TK_v1, self).__init__()

    n_kernels = len(kernels_mu)
    if len(kernels_mu) != len(kernels_sigma):
        raise Exception("len(kernels_mu) != len(kernels_sigma)")

    # static - kernel size & magnitude variables
    self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    self.nn_scaler = nn.Parameter(
        torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
    self.mixer = nn.Parameter(
        torch.full([1, 1, 1], 0.5, dtype=torch.float32, requires_grad=True))

    self.stacked_att = StackedSelfAttentionEncoder(
        input_dim=_embsize,
        hidden_dim=_embsize,
        projection_dim=att_proj_dim,
        feedforward_hidden_dim=att_ff_dim,
        num_layers=att_layer,
        num_attention_heads=att_heads,
        dropout_prob=0,
        residual_dropout_prob=0,
        attention_dropout_prob=0)

    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    # bias is set to True in the original code (we found it not to help)
    self.dense = nn.Linear(n_kernels, 1, bias=False)
    self.dense_mean = nn.Linear(n_kernels, 1, bias=False)
    self.dense_comb = nn.Linear(2, 1, bias=False)

    # init with small weights, otherwise the dense output is way too high for the
    # tanh -> resulting in loss == 1 all the time; inits taken from matchzoo
    torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)
    torch.nn.init.uniform_(self.dense_mean.weight, -0.014, 0.014)

def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int):
    super(AttentionSeq2Veq, self).__init__(stateful=False)
    self._seq2seq = StackedSelfAttentionEncoder(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        projection_dim=projection_dim,
        feedforward_hidden_dim=feedforward_hidden_dim,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads)
    self._hidden_dim = hidden_dim
    self._input_dim = input_dim

def __init__(self,
             input_dim,
             hidden_dim,
             projection_dim,
             feedforward_hidden_dim,
             num_layers,
             num_attention_heads,
             stateful: bool = False) -> None:
    super().__init__(stateful)
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.seq_2_seq = StackedSelfAttentionEncoder(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        projection_dim=projection_dim,
        feedforward_hidden_dim=feedforward_hidden_dim,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads)

def __init__(self, args, dictionary, source_embedder: TextFieldEmbedder, left_pad=False):
    super().__init__(dictionary)
    self._seq2seq_encoder = StackedSelfAttentionEncoder(
        input_dim=int(source_embedder.get_output_dim()),
        hidden_dim=int(args.encoder_embed_dim),
        projection_dim=int(args.encoder_embed_dim / args.encoder_attention_heads),
        feedforward_hidden_dim=int(args.encoder_ffn_embed_dim),
        num_layers=int(args.encoder_layers),
        num_attention_heads=int(args.encoder_attention_heads),
        use_positional_encoding=True,
        # dropout values are probabilities in [0, 1]; casting them to int would
        # silently truncate them to 0, so keep them as floats
        dropout_prob=float(args.dropout),
        residual_dropout_prob=float(args.relu_dropout),
        attention_dropout_prob=float(args.attention_dropout))
    self._source_embedder = source_embedder
    embed_dim = source_embedder.get_output_dim()
    self.embed_scale = math.sqrt(embed_dim)
    self._max_source_positions = args.max_source_positions

def __init__(self,
             input_size: int,
             hidden_size: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super(TransformerSeq2VecEncoder, self).__init__()
    self.stacked_attention = StackedSelfAttentionEncoder(
        input_size, hidden_size, projection_dim, feedforward_hidden_dim,
        num_layers, num_attention_heads, use_positional_encoding,
        dropout_prob, residual_dropout_prob, attention_dropout_prob)
    self.input_dim = input_size
    self.output_dim = self.stacked_attention._attention_layers[-1].get_output_dim()

def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    upsample: torch.nn.Module = None,
    net: Seq2SeqEncoder = None,
    target_namespace: str = "target_tokens",
    target_embedding_dim: int = None,
    use_bleu: bool = True,
    loss_type: str = "ctc",
    label_smoothing: float = None,
) -> None:
    super(LatentAignmentCTC, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                 self._target_namespace)
    self._blank_index = self.vocab.get_token_index(SPECIAL_BLANK_TOKEN,
                                                   self._target_namespace)

    if use_bleu:
        self._bleu = BLEU(exclude_indices={self._pad_index, self._blank_index})
    else:
        self._bleu = None

    self._source_embedder = source_embedder
    source_embedding_dim = source_embedder.get_output_dim()

    self._upsample = upsample or LinearUpsample(source_embedding_dim, s=3)
    self._net = net or StackedSelfAttentionEncoder(input_dim=source_embedding_dim,
                                                   hidden_dim=128,
                                                   projection_dim=128,
                                                   feedforward_hidden_dim=512,
                                                   num_layers=4,
                                                   num_attention_heads=4)

    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    target_embedding_dim = self._net.get_output_dim()
    self._output_projection = torch.nn.Linear(target_embedding_dim, num_classes)

    self.loss_type = loss_type
    self.label_smoothing = label_smoothing

def __init__(self,
             input_dim: int,
             num_head: int = 3,
             bert_self_attn_layers: BertSelfAttnLayers = None) -> None:
    super().__init__()
    self._global_attention = TimeDistributed(torch.nn.Linear(input_dim, 1))
    self._num_heads = num_head
    self._span_token_emb = nn.Parameter(torch.Tensor(input_dim))
    nn.init.normal_(self._span_token_emb)

    if bert_self_attn_layers is not None:
        self._input_dim = self._output_dim = input_dim
        self._stacked_self_attention = bert_self_attn_layers
    else:
        self._input_dim = input_dim
        self._output_dim = input_dim // 4
        self._stacked_self_attention = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=self._output_dim,
            projection_dim=self._output_dim,
            feedforward_hidden_dim=4 * self._output_dim,
            num_layers=2,
            num_attention_heads=1,
            use_positional_encoding=True)

train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={'tokens': 3, 'target_tokens': 3})

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
#     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

max_decoding_steps = 800
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,

def __init__(self,
             idiom_vector_path: str,
             dropout: float,
             vocab: Vocabulary,
             content_embedder: TextFieldEmbedder,
             use_pretrained: bool = False,
             use_reasoner: bool = False,
             idiom_vector_size: int = None,
             reasoner_mode: str = None) -> None:
    super().__init__(vocab)
    self.content_embedder = content_embedder

    if idiom_vector_size is not None and use_pretrained:
        raise ValueError(
            "When `use_pretrained` is True, `idiom_vector_size` must be None.")

    idiom_list, idiom_vectors = [], []
    with open(idiom_vector_path) as fh:
        for line in fh:
            idiom_list.append(line.strip().split()[0])
            idiom_vectors.append(list(map(float, line.strip().split()[1:])))

    self.use_pretrained = use_pretrained
    if self.use_pretrained:
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=len(idiom_vectors[0]),
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            weight=torch.FloatTensor(idiom_vectors))
    else:
        embedding_dim = idiom_vector_size or len(idiom_vectors[0])
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=embedding_dim,
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            # weight=torch.FloatTensor(idiom_vectors)
        )

    self.dropout = nn.Dropout(dropout)
    self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

    self.use_reasoner = use_reasoner
    if use_reasoner:
        embedding_size = self.content_embedder.get_output_dim()
        if reasoner_mode is None:
            reasoner_mode = 'self_attention'
        else:
            reasoner_mode = reasoner_mode.lower()
        assert reasoner_mode in ('self_attention', 'gated_self_attention')
        self.reasoner_mode = reasoner_mode
        if reasoner_mode == 'self_attention':
            self.option_reasoner = StackedSelfAttentionEncoder(
                input_dim=embedding_size,
                hidden_dim=embedding_size,
                projection_dim=embedding_size,
                feedforward_hidden_dim=embedding_size,
                num_layers=1,
                num_attention_heads=2,
                use_positional_encoding=False)
        elif reasoner_mode == "gated_self_attention":
            self.option_reasoner = GatedSelfAttention(
                input_dim=embedding_size,
                hidden_dim=embedding_size,
                projection_dim=embedding_size,
                feedforward_hidden_dim=embedding_size,
                num_layers=1,
                num_attention_heads=2)

    self.loss = nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()

def __init__(self,
             idiom_vector_path: str,
             dropout: float,
             vocab: Vocabulary,
             content_embedder: TextFieldEmbedder,
             use_pretrained: bool = False,
             use_reasoner: bool = False,
             idiom_vector_size: int = None,
             denoise_mode: str = 'soft',
             denoise_lambda: float = None,
             teacher_model_path: str = None,
             teacher_mode: str = None) -> None:
    super().__init__(vocab)
    self.content_embedder = content_embedder

    if idiom_vector_size is not None and use_pretrained:
        raise ValueError(
            "When `use_pretrained` is True, `idiom_vector_size` must be None.")

    if teacher_mode is not None:
        teacher_mode = teacher_mode.lower()
        assert teacher_mode in ('initialization', 'teacher'), (
            f'teacher_mode ({teacher_mode}) '
            'not in ("initialization", "teacher").')
    if teacher_mode is not None and teacher_model_path is None:
        raise ValueError(
            "Please set teacher_model_path when teacher_mode is not None.")
    self.teacher_mode = teacher_mode
    self.teacher_model_path = teacher_model_path
    self.teacher = self.load_teacher()

    idiom_list, idiom_vectors = [], []
    with open(idiom_vector_path) as fh:
        for line in fh:
            idiom_list.append(line.strip().split()[0])
            idiom_vectors.append(list(map(float, line.strip().split()[1:])))

    self.use_pretrained = use_pretrained
    if self.use_pretrained:
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=len(idiom_vectors[0]),
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            weight=torch.FloatTensor(idiom_vectors))
    else:
        embedding_dim = idiom_vector_size or len(idiom_vectors[0])
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=embedding_dim,
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            # weight=torch.FloatTensor(idiom_vectors)
        )

    self.dropout = nn.Dropout(dropout)
    self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

    # this model always uses the reasoner, regardless of the flag
    if use_reasoner:
        logger.info(f"{type(self)} always uses the reasoner.")
    self.use_reasoner = True
    embedding_size = self.content_embedder.get_output_dim()
    self.option_reasoner = StackedSelfAttentionEncoder(
        input_dim=embedding_size,
        hidden_dim=embedding_size,
        projection_dim=embedding_size,
        feedforward_hidden_dim=embedding_size,
        num_layers=1,
        num_attention_heads=2,
        use_positional_encoding=False)

    if self.teacher_mode == 'initialization':
        self.option_embedder.weight = self.teacher.option_embedder.weight
        self.option_embedder.weight.requires_grad = True
        self.scorer.weight = self.teacher.scorer.weight
        self.scorer.weight.requires_grad = True

    denoise_mode = denoise_mode.lower()
    assert denoise_mode in ('soft', 'hard', 'both', 'lambda'), (
        f'denoise_mode ({denoise_mode}) '
        'not in ("soft", "hard", "both", "lambda").')
    self.denoise_mode = denoise_mode
    self.denoise_lambda = denoise_lambda

    self.loss = nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()

def __init__(self,
             idiom_vector_path: str,
             dropout: float,
             vocab: Vocabulary,
             content_embedder: TextFieldEmbedder,
             option_vector_encoder: Seq2VecEncoder,
             use_pretrained: bool = False,
             use_idiom_embedding: bool = True,
             use_idiom_text: bool = False,
             use_idiom_definition: bool = False,
             use_reasoner: bool = False,
             idiom_vector_size: int = None) -> None:
    super().__init__(vocab)

    if idiom_vector_size is not None and use_pretrained:
        raise ValueError(
            "When `use_pretrained` is True, `idiom_vector_size` must be None.")
    if not use_idiom_embedding and use_pretrained:
        raise ValueError(
            "use_pretrained=True but use_idiom_embedding=False.")

    # supposed to be a BERT model
    self.content_embedder = content_embedder
    self.option_vector_encoder = option_vector_encoder

    self.use_idiom_embedding = use_idiom_embedding
    if self.use_idiom_embedding:
        idiom_list, idiom_vectors = [], []
        with open(idiom_vector_path) as fh:
            for line in fh:
                idiom_list.append(line.strip().split()[0])
                idiom_vectors.append(list(map(float, line.strip().split()[1:])))

        self.use_pretrained = use_pretrained
        if self.use_pretrained:
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=len(idiom_vectors[0]),
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                weight=torch.FloatTensor(idiom_vectors))
        else:
            embedding_dim = idiom_vector_size or len(idiom_vectors[0])
            self.option_embedder = modules.Embedding(
                num_embeddings=len(idiom_list),
                embedding_dim=embedding_dim,
                projection_dim=self.content_embedder.get_output_dim(),
                # use the pretrained idiom vectors
                # weight=torch.FloatTensor(idiom_vectors)
            )

    self.option_vector_encoder = option_vector_encoder

    if self.use_idiom_embedding:
        idiom_merger_in_features = self.option_embedder.get_output_dim()
    else:
        idiom_merger_in_features = 0

    self.use_idiom_text = use_idiom_text
    if self.use_idiom_text:
        idiom_merger_in_features += self.option_vector_encoder.get_output_dim()

    self.use_idiom_definition = use_idiom_definition
    if self.use_idiom_definition:
        idiom_merger_in_features += self.option_vector_encoder.get_output_dim()

    self.option_merger = nn.Linear(
        in_features=idiom_merger_in_features,
        out_features=self.content_embedder.get_output_dim(),
        bias=True)

    self.dropout = nn.Dropout(dropout)
    self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

    self.use_reasoner = use_reasoner
    if use_reasoner:
        embedding_size = self.content_embedder.get_output_dim()
        self.option_reasoner = StackedSelfAttentionEncoder(
            input_dim=embedding_size,
            hidden_dim=embedding_size,
            projection_dim=embedding_size,
            feedforward_hidden_dim=embedding_size,
            num_layers=1,
            num_attention_heads=2,
            use_positional_encoding=False)

    self.loss = nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()

def __init__(self, args, input_dim, hidden_dim, word_embedder):
    super(RelationAttendedDefinitionSentenceEncoder, self).__init__()
    self.config = args
    self.args = args
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.projection_dim = input_dim
    self.feedforward_hidden_dim = input_dim
    self.num_layers = self.args.num_layers_for_stackatt
    self.num_attention_heads = self.args.num_atthead_for_stackatt
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)

    if self.args.definition_seq2seq == 'passthrough':
        self.seq2seq = PassThroughEncoder(input_dim=input_dim)
    elif self.args.definition_seq2seq == 'multiheadstackatt':
        self.seq2seq = StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=input_dim,
            projection_dim=input_dim,
            feedforward_hidden_dim=input_dim,
            num_layers=2,
            num_attention_heads=2)
    elif self.args.definition_seq2seq == 'qanet':
        self.seq2seq = QaNetEncoder(input_dim=input_dim,
                                    hidden_dim=input_dim,
                                    attention_projection_dim=input_dim,
                                    feedforward_hidden_dim=input_dim,
                                    num_blocks=2,
                                    num_convs_per_block=2,
                                    conv_kernel_size=3,
                                    num_attention_heads=2)
    elif self.args.definition_seq2seq == 'intrasentenceatt':
        self.seq2seq = IntraSentenceAttentionEncoder(input_dim=input_dim,
                                                     projection_dim=input_dim,
                                                     output_dim=input_dim)
    elif self.args.definition_seq2seq == 'gatedcnn':
        self.seq2seq = GatedCnnEncoder(
            input_dim=512,
            layers=[[[4, 512]],
                    [[4, 512], [4, 512]],
                    [[4, 512], [4, 512]],
                    [[4, 512], [4, 512]]],
            dropout=0.05)
    elif self.args.definition_seq2seq == 'bilmtransformer':
        self.seq2seq = BidirectionalLanguageModelTransformer(
            input_dim=input_dim, hidden_dim=input_dim, num_layers=2)
    # elif self.args.definition_seq2seq == 'feedforward':
    #     feedforward = FeedForward(input_dim=input_dim, num_layers=1,
    #                               hidden_dims=input_dim,
    #                               activations=self.args.activation_for_sentence_ff)
    #     self.seq2seq = FeedForwardEncoder(feedforward)
    #     # supported activation names include: "linear", "relu", "relu6", "elu",
    #     # "prelu", "leaky_relu", "threshold", "hardtanh", "sigmoid", "tanh",
    #     # "log_sigmoid", "softplus", "softshrink", "softsign", "tanhshrink"
    elif self.args.definition_seq2seq == 'multiheadselfatt':
        self.seq2seq = MultiHeadSelfAttention(num_heads=2,
                                              input_dim=input_dim,
                                              output_projection_dim=input_dim,
                                              attention_dim=input_dim,
                                              values_dim=input_dim)
    else:
        print('Encoder not defined:', self.args.definition_seq2seq)
        exit()

train_dataset = reader.read(cached_path("data/train"))
validation_dataset = reader.read(cached_path("data/dev"))

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=100,
                            pretrained_file="https://s3-us-west-2.amazonaws.com/allennlp/"
                                            "datasets/glove/glove.6B.100d.txt.gz",
                            trainable=True)
source_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

# encoder = PytorchSeq2SeqWrapper(StackedAlternatingLstm(input_size=200,
#                                                        hidden_size=300,
#                                                        num_layers=4,
#                                                        recurrent_dropout_probability=0.1,
#                                                        use_highway=True))
encoder = StackedSelfAttentionEncoder(input_dim=200,
                                      hidden_dim=300,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq

max_decoding_steps = 50   # TODO: make this variable
ZH_EMBEDDING_DIM = 10
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='labels',
                      attention=attention,
                      beam_size=8,

def __init__(self,
             idiom_vector_path: str,
             idiom_graph_path: str,
             dropout: float,
             vocab: Vocabulary,
             content_embedder: TextFieldEmbedder,
             use_pretrained: bool = False,
             idiom_vector_size: int = None,
             neighbor_num: int = 7,
             num_neighbour_attention_heads: int = 2) -> None:
    super().__init__(vocab)
    self.content_embedder = content_embedder

    if idiom_vector_size is not None and use_pretrained:
        raise ValueError(
            "When `use_pretrained` is True, `idiom_vector_size` must be None.")

    idiom_list, idiom_vectors = [], []
    with open(idiom_vector_path) as fh:
        for line in fh:
            idiom_list.append(line.strip().split()[0])
            idiom_vectors.append(list(map(float, line.strip().split()[1:])))

    self.graph_embedder = GraphEmbedder(idiom_graph_path,
                                        neighbor_num=neighbor_num,
                                        drop_neighbor=False)

    self.use_pretrained = use_pretrained
    if self.use_pretrained:
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=len(idiom_vectors[0]),
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            weight=torch.FloatTensor(idiom_vectors))
    else:
        embedding_dim = idiom_vector_size or len(idiom_vectors[0])
        self.option_embedder = modules.Embedding(
            num_embeddings=len(idiom_list),
            embedding_dim=embedding_dim,
            projection_dim=self.content_embedder.get_output_dim(),
            # use the pretrained idiom vectors
            # weight=torch.FloatTensor(idiom_vectors)
        )

    self.dropout = nn.Dropout(dropout)
    self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

    embedding_size = self.content_embedder.get_output_dim()
    self.neighbour_reasoner = StackedSelfAttentionEncoder(
        input_dim=embedding_size,
        hidden_dim=embedding_size,
        projection_dim=embedding_size,
        feedforward_hidden_dim=embedding_size,
        num_layers=1,
        num_attention_heads=num_neighbour_attention_heads,
        use_positional_encoding=False)
    self.option_encoder = FirstVecEncoder(embedding_dim=embedding_size)
    self.option_reasoner = StackedSelfAttentionEncoder(
        input_dim=embedding_size,
        hidden_dim=embedding_size,
        projection_dim=embedding_size,
        feedforward_hidden_dim=embedding_size,
        num_layers=1,
        num_attention_heads=2,
        use_positional_encoding=False)

    self.data_merger = FeedForward(
        input_dim=embedding_size + embedding_size + embedding_size,
        num_layers=1,
        hidden_dims=embedding_size,
        activations=Activation.by_name('linear')(),
        dropout=0.1)

    self.loss = nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()

def __init__(self,
             idiom_vector_path: str,
             idiom_graph_path: str,
             dropout: float,
             vocab: Vocabulary,
             content_embedder: TextFieldEmbedder,
             neighbor_num: int = 7,
             mode: List[str] = None) -> None:
    super().__init__(vocab)
    self.content_embedder = content_embedder

    idiom_list, idiom_vectors = [], []
    with open(idiom_vector_path) as fh:
        for line in fh:
            idiom_list.append(line.strip().split()[0])
            idiom_vectors.append(list(map(float, line.strip().split()[1:])))

    self.graph_embedder = GraphEmbedder(idiom_graph_path,
                                        neighbor_num=neighbor_num,
                                        drop_neighbor=False)

    embedding_dim = self.content_embedder.get_output_dim()
    self.option_embedder = modules.Embedding(
        num_embeddings=len(idiom_list),
        embedding_dim=embedding_dim,
        # use the pretrained idiom vectors
        # weight=torch.FloatTensor(idiom_vectors)
    )

    self.dropout = nn.Dropout(dropout)
    self.scorer = nn.Linear(self.content_embedder.get_output_dim(), 1)

    embedding_size = self.content_embedder.get_output_dim()
    self.neighbour_reasoner = StackedSelfAttentionEncoder(
        input_dim=embedding_size,
        hidden_dim=embedding_size,
        projection_dim=embedding_size,
        feedforward_hidden_dim=embedding_size,
        num_layers=1,
        num_attention_heads=2,
        use_positional_encoding=False)
    self.option_encoder = FirstVecEncoder(embedding_dim=embedding_size)
    self.option_reasoner = StackedSelfAttentionEncoder(
        input_dim=embedding_size,
        hidden_dim=embedding_size,
        projection_dim=embedding_size,
        feedforward_hidden_dim=embedding_size,
        num_layers=1,
        num_attention_heads=2,
        use_positional_encoding=False)

    if mode is None:
        mode = ['raw', 'ocn', 'nn']
    else:
        for item in mode:
            assert item in ['raw', 'ocn', 'nn'], f"{item} is invalid"
    self.mode = mode

    self.data_merger = FeedForward(
        input_dim=embedding_size * len(mode),
        num_layers=1,
        hidden_dims=embedding_size,
        activations=Activation.by_name('linear')(),
        dropout=0.1)

    self.loss = nn.CrossEntropyLoss()
    self.acc = CategoricalAccuracy()