def __init__(self,
             vocab: Vocabulary,
             task: str,
             encoder: Seq2SeqEncoder,
             png_params_dim: int,
             label_smoothing: float = 0.0,
             dropout: float = 0.0,
             adaptive: bool = False,
             features: List[str] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(TagDecoder, self).__init__(vocab, regularizer)

    self.dropout = torch.nn.Dropout(p=dropout)
    self.task = task
    self.encoder = encoder
    self.output_dim = encoder.get_output_dim()
    self.label_smoothing = label_smoothing
    self.num_classes = self.vocab.get_vocab_size(task)
    self.adaptive = adaptive
    # Sanitize bracket characters so feature namespace names are safe to use
    # as ModuleDict keys below.
    self.features = [f.replace('[', '_').replace(']', '_') for f in features] if features else []

    self.metrics = {
        "acc": CategoricalAccuracy(),
        # "acc3": CategoricalAccuracy(top_k=3)
    }

    if self.adaptive:
        # TODO
        adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
        self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                      self.num_classes,
                                                      cutoffs=adaptive_cutoffs,
                                                      div_value=4.0)
    else:
        self.task_output = TimeDistributedPGN(
            LinearWithPGN(png_params_dim, self.output_dim, self.num_classes))

    self.feature_outputs = torch.nn.ModuleDict()
    self.features_metrics = {}
    for feature in self.features:
        self.feature_outputs[feature] = TimeDistributedPGN(
            LinearWithPGN(png_params_dim, self.output_dim, vocab.get_vocab_size(feature)))
        self.features_metrics[feature] = {
            "acc": CategoricalAccuracy(),
        }

    initializer(self)
def get_calc(context):
    # Nested helper: presumably defined inside a method, so `self` is
    # captured from the enclosing scope.
    if self.model_params.use_hardcoded_cutoffs:
        vocab_size = self.entity_embeds.weight.shape[0]
        cutoffs = self.model_params.adaptive_softmax_cutoffs
    else:
        raise NotImplementedError

    in_features = self.entity_embeds.weight.shape[1]
    n_classes = self.entity_embeds.weight.shape[0]
    return AdaptiveLogSoftmaxWithLoss(in_features, n_classes, cutoffs,
                                      div_value=1.0).to(self.device)
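# A minimal, self-contained sketch (not part of the examples above) of how a
# head like the one get_calc returns is used. forward(input, target) returns a
# NamedTuple (output, loss): `output` holds the per-example target
# log-probabilities and `loss` is the mean negative log-likelihood;
# log_prob / predict serve inference. Shapes and sizes here are illustrative.
import torch
from torch.nn import AdaptiveLogSoftmaxWithLoss

head = AdaptiveLogSoftmaxWithLoss(in_features=64, n_classes=5000,
                                  cutoffs=[100, 1000], div_value=1.0)
features = torch.randn(8, 64)             # (batch, in_features)
targets = torch.randint(0, 5000, (8,))    # (batch,)
out = head(features, targets)
out.loss.backward()                       # scalar mean NLL over the batch
log_probs = head.log_prob(features)       # (batch, n_classes) full distribution
preds = head.predict(features)            # (batch,) argmax class per example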
def __init__(self, num_embeddings, embedding_dim, padding_idx, conv_filters,
             n_highways, projection_size, vocab_size):
    super(ELMoNet, self).__init__()
    self.num_embeddings = num_embeddings
    self.embedding_dim = embedding_dim
    self.padding_idx = padding_idx
    self.conv_filters = conv_filters
    self.n_highways = n_highways
    self.projection_size = projection_size
    self.char_embedding = CharEmbedding(self.num_embeddings, self.embedding_dim,
                                        self.padding_idx, self.conv_filters,
                                        self.n_highways, self.projection_size)
    self.hidden_size = 2048
    # Two-layer forward and backward LSTMs. Each layer's 2048-d output is
    # projected back to projection_size by the linear layers below, which is
    # why the second-layer LSTMs again take projection_size-dimensional input.
    self.lstm1f = nn.LSTM(self.projection_size, self.hidden_size, 1, batch_first=True)
    self.lstm2f = nn.LSTM(self.projection_size, self.hidden_size, 1, batch_first=True)
    self.lstm1r = nn.LSTM(self.projection_size, self.hidden_size, 1, batch_first=True)
    self.lstm2r = nn.LSTM(self.projection_size, self.hidden_size, 1, batch_first=True)
    self.linear1f = nn.Linear(self.hidden_size, self.projection_size)
    self.linear1r = nn.Linear(self.hidden_size, self.projection_size)
    self.linear2f = nn.Linear(self.hidden_size, self.projection_size)
    self.linear2r = nn.Linear(self.hidden_size, self.projection_size)
    self.adap_loss = AdaptiveLogSoftmaxWithLoss(self.projection_size, vocab_size,
                                                [10, 100, 1000])
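# A hedged sketch of the forward wiring these layers imply; the actual forward
# method is not shown above, so this is an assumption based on the layer
# shapes and on how the model is consumed in the training loop further below
# (it returns one feature tensor per direction, each projection_size wide).
def forward(self, forward_chars, backward_chars):
    fwd = self.char_embedding(forward_chars)    # (batch, seq, projection_size)
    bwd = self.char_embedding(backward_chars)

    fwd, _ = self.lstm1f(fwd)                   # (batch, seq, 2048)
    fwd = self.linear1f(fwd)                    # project back to projection_size
    fwd, _ = self.lstm2f(fwd)
    fwd = self.linear2f(fwd)

    bwd, _ = self.lstm1r(bwd)
    bwd = self.linear1r(bwd)
    bwd, _ = self.lstm2r(bwd)
    bwd = self.linear2r(bwd)

    return fwd, bwd                             # scored by adap_loss / an external head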
def __init__(self, input_dim, num_classes, label_smoothing: float = 0.03,
             adaptive: bool = False) -> None:
    super(TagDecoder, self).__init__()
    self.label_smoothing = label_smoothing
    self.num_classes = num_classes
    self.adaptive = adaptive
    if self.adaptive:
        adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
        self.task_output = AdaptiveLogSoftmaxWithLoss(input_dim,
                                                      self.num_classes,
                                                      cutoffs=adaptive_cutoffs,
                                                      div_value=4.0)
    else:
        self.task_output = Linear(input_dim, self.num_classes)
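# Making the recurring cutoff heuristic concrete: with num_classes = 300 the
# cutoffs come out as [20, 60], so the ~20 lowest-indexed labels sit in the
# fast head and the rest fall into two tail clusters whose capacity shrinks by
# div_value=4.0 per cluster. This pays off only if label indices are sorted by
# descending frequency, which adaptive softmax expects. A quick validity check
# with illustrative numbers (not taken from the examples above):
from torch.nn import AdaptiveLogSoftmaxWithLoss

num_classes = 300
adaptive_cutoffs = [round(num_classes / 15), 3 * round(num_classes / 15)]
assert adaptive_cutoffs == [20, 60]
head = AdaptiveLogSoftmaxWithLoss(128, num_classes,
                                  cutoffs=adaptive_cutoffs, div_value=4.0)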
def __init__(self,
             vocab: Vocabulary,
             task: str,
             encoder: Seq2SeqEncoder,
             lang_embed_dim: int = None,
             use_lang_feedforward: bool = False,
             lang_feedforward: FeedForward = None,
             label_smoothing: float = 0.0,
             dropout: float = 0.0,
             adaptive: bool = False,
             features: List[str] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(TagDecoder, self).__init__(vocab, regularizer)

    self.lang_embedding = None
    if lang_embed_dim is not None:
        self.lang_embedding = Embedding(self.vocab.get_vocab_size("langs"), lang_embed_dim)

    self.dropout = torch.nn.Dropout(p=dropout)
    self.task = task
    self.encoder = encoder
    self.output_dim = encoder.get_output_dim()
    self.label_smoothing = label_smoothing
    self.num_classes = self.vocab.get_vocab_size(task)
    self.adaptive = adaptive
    self.features = features if features else []

    self.use_lang_feedforward = use_lang_feedforward
    if self.lang_embedding is not None and use_lang_feedforward:
        self.lang_feedforward = lang_feedforward or \
            FeedForward(self.output_dim, 1, self.output_dim,
                        Activation.by_name("elu")())

    self.metrics = {
        "acc": CategoricalAccuracy(),
        # "acc3": CategoricalAccuracy(top_k=3)
    }

    if self.adaptive:
        # TODO
        adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
        self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                      self.num_classes,
                                                      cutoffs=adaptive_cutoffs,
                                                      div_value=4.0)
    else:
        self.task_output = TimeDistributed(Linear(self.output_dim, self.num_classes))

    self.feature_outputs = torch.nn.ModuleDict()
    self.features_metrics = {}
    for feature in self.features:
        self.feature_outputs[feature] = TimeDistributed(
            Linear(self.output_dim, vocab.get_vocab_size(feature)))
        self.features_metrics[feature] = {
            "acc": CategoricalAccuracy(),
        }

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             task: str,
             encoder: Seq2SeqEncoder,
             prev_task: str,
             prev_task_embed_dim: int = None,
             label_smoothing: float = 0.0,
             dropout: float = 0.0,
             adaptive: bool = False,
             features: List[str] = None,
             metric: str = "acc",
             loss_weight: float = 1.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(TagDecoder, self).__init__(vocab, regularizer)

    self.task = task
    self.dropout = torch.nn.Dropout(p=dropout)
    self.encoder = encoder
    self.output_dim = encoder.get_output_dim()
    self.label_smoothing = label_smoothing
    self.num_classes = self.vocab.get_vocab_size(task)
    self.adaptive = adaptive
    self.features = features if features else []
    self.metric = metric
    self.loss_weight = loss_weight

    # A: add all possible relative encodings to the vocabulary
    if self.vocab.get_token_index('100,root') == 1:
        for head in self.vocab.get_token_to_index_vocabulary('head_tags').keys():
            all_encodings = get_all_relative_encodings(head)
            self.vocab.add_tokens_to_namespace(tokens=all_encodings, namespace='dep_encoded')
        # make sure to put the end token '100,root'
        self.vocab.add_token_to_namespace(token='100,root', namespace='dep_encoded')

    self.prev_task_tag_embedding = None
    if prev_task_embed_dim is not None and prev_task_embed_dim != 0 and prev_task is not None:
        if prev_task != 'rependency':
            self.prev_task_tag_embedding = Embedding(self.vocab.get_vocab_size(prev_task),
                                                     prev_task_embed_dim)
        else:
            self.prev_task_tag_embedding = Embedding(self.vocab.get_vocab_size('dep_encoded'),
                                                     prev_task_embed_dim)

    # Choose the metric to use for the evaluation (from the defined "metric"
    # value of the task). If not specified, default to accuracy.
    if self.metric == "acc":
        self.metrics = {"acc": CategoricalAccuracy()}
    elif self.metric == "span_f1":
        self.metrics = {"span_f1": SpanBasedF1Measure(
            self.vocab, tag_namespace=self.task, label_encoding="BIO")}
    else:
        logger.warning(f"ERROR. Metric: {self.metric} unrecognized. Using accuracy instead.")
        self.metrics = {"acc": CategoricalAccuracy()}

    if self.adaptive:
        # TODO
        adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
        self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                      self.num_classes,
                                                      cutoffs=adaptive_cutoffs,
                                                      div_value=4.0)
    else:
        self.task_output = TimeDistributed(Linear(self.output_dim, self.num_classes))

    self.feature_outputs = torch.nn.ModuleDict()
    self.features_metrics = {}
    for feature in self.features:
        self.feature_outputs[feature] = TimeDistributed(
            Linear(self.output_dim, vocab.get_vocab_size(feature)))
        self.features_metrics[feature] = {
            "acc": CategoricalAccuracy(),
        }

    initializer(self)
def train(train_dataset, train_dataset_reverse, config_path, char_lexicon,
          word_lexicon, batch_size, learning_rate, device, max_epoch, output_dir):
    with open(config_path, 'r') as f:
        config = json.load(f)

    # shuffle=False keeps the forward and backward streams aligned batch-for-batch.
    forward_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    backward_loader = DataLoader(train_dataset_reverse, batch_size=batch_size, shuffle=False)

    num_embeddings = len(char_lexicon)
    padding_idx = char_lexicon['<pad>']
    model = ELMoNet(num_embeddings, config['embedding_dim'], padding_idx,
                    config['filters'], config['n_highways'], config['projection_size'])
    model.train(True)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    cutoffs = [100, 1000, 10000]
    word_lexicon_size = len(word_lexicon)
    adap_loss = AdaptiveLogSoftmaxWithLoss(config['projection_size'], word_lexicon_size, cutoffs)
    adap_loss = adap_loss.to(device)

    for epoch in range(max_epoch):
        loss = 0
        epoch_log = {}
        # Rebuild the progress iterator every epoch; a single tqdm-wrapped
        # enumerate would be exhausted after the first pass over the data.
        trange = tqdm(enumerate(zip(forward_loader, backward_loader)),
                      total=len(forward_loader), desc='training', ascii=True)
        for i, ((forward_batch, forward_label), (backward_batch, backward_label)) in trange:
            optimizer.zero_grad()
            forward_feature, backward_feature = \
                model(forward_batch.to(device), backward_batch.to(device))

            # AdaptiveLogSoftmaxWithLoss expects 2-D input, so flatten
            # (batch, seq_len, dim) -> (batch * seq_len, dim) and labels to 1-D.
            forward_feature = forward_feature.view(-1, forward_feature.size()[2])
            forward_label = forward_label.view(forward_label.size()[0] * forward_label.size()[1])
            forward_output, forward_loss = \
                adap_loss(forward_feature, forward_label.to(device))

            backward_feature = backward_feature.view(-1, backward_feature.size()[2])
            backward_label = backward_label.view(backward_label.size()[0] * backward_label.size()[1])
            backward_output, backward_loss = \
                adap_loss(backward_feature, backward_label.to(device))

            forward_loss.backward()
            backward_loss.backward()
            optimizer.step()

            loss += (forward_loss.item() + backward_loss.item()) / 2
            trange.set_postfix(loss=loss / (i + 1))

        loss /= len(forward_loader)
        epoch_log["epoch{}".format(epoch)] = loss
        print("epoch=%d\n" % epoch)
        print("loss=%f\n" % loss)
        save_model(model, epoch, output_dir)
        save_log(epoch_log, output_dir)
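# An evaluation-side sketch to pair with the training loop above. `evaluate`
# is a hypothetical helper, not part of the original code: it reuses the same
# adap_loss head to score held-out batches (the returned loss is the mean NLL
# per token) and reports perplexity as exp(mean NLL). Only the forward
# direction is scored here for brevity.
import math

import torch


def evaluate(model, adap_loss, forward_loader, backward_loader, device):
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for (fwd_batch, fwd_label), (bwd_batch, bwd_label) in zip(forward_loader,
                                                                  backward_loader):
            fwd_feat, _ = model(fwd_batch.to(device), bwd_batch.to(device))
            fwd_feat = fwd_feat.view(-1, fwd_feat.size(2))   # flatten to 2-D
            fwd_label = fwd_label.view(-1).to(device)
            _, nll = adap_loss(fwd_feat, fwd_label)          # mean NLL over tokens
            total_nll += nll.item() * fwd_label.numel()
            total_tokens += fwd_label.numel()
    return math.exp(total_nll / total_tokens)                # perplexity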