def __init__(self, config, input_dim, num_layers, num_classes, encoder_dim=None,
             bert_pretrained=True, bert_pretrained_model_name='bert-base-cased'):
    super().__init__()
    self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)
    self.bert_asr = get_bert(bert_pretrained, bert_pretrained_model_name)

    # Project the speech encoder output to the BERT hidden size
    # (bert hidden_size = 768, enc_dim = 128).
    self.aux_embedding = nn.Linear(config.enc_dim, self.bert.config.hidden_size)

    # Lugosch speech encoder pretrained on LibriSpeech.
    self.lugosch_model = lugosch.models.PretrainedModel(config)
    pretrained_model_path = os.path.join(config.libri_folder, "libri_pretraining",
                                         "model_state.pth")
    self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
    self.config = config

    # Freeze phoneme and word layers; unfreeze gradually during training.
    self.freeze_all_layers()
    self.unfreezing_index = 1

    self.maxpool = MaskedMaxPool()
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
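# `get_bert` is called throughout these constructors but is not defined in this
# excerpt. Below is a minimal sketch of what it likely wraps, assuming the
# HuggingFace `transformers` package; the fallback to a randomly initialised
# BertModel when `pretrained=False` is an assumption, not the repository's code.
from transformers import BertConfig, BertModel

def get_bert(pretrained=True, model_name='bert-base-cased'):
    # Load pretrained weights, or build an untrained model with the same config.
    if pretrained:
        return BertModel.from_pretrained(model_name)
    return BertModel(BertConfig.from_pretrained(model_name))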
def __init__(self, num_classes, pretrained=True):
    super().__init__()
    self.bert = get_bert(pretrained)
    self.classifier = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(self.bert.config.hidden_size, num_classes)
    )
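# Hypothetical usage of the text-only classifier above: run BERT on tokenised
# transcripts and classify from the [CLS] representation. This is a sketch of
# one plausible forward pass (assuming a transformers version whose BertModel
# returns a dict-like output), not the repository's actual forward method.
def forward_text_only(model, input_ids, attention_mask):
    outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)
    cls_repr = outputs.last_hidden_state[:, 0]   # [CLS] token embedding
    return model.classifier(cls_repr)            # (batch, num_classes) logits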
def __init__(self, config, input_dim, num_layers, num_classes, encoder_dim=None,
             bert_pretrained=True, bert_pretrained_model_name='bert-base-cased'):
    super().__init__()
    self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)

    # Model 3 is BERT trained on both ground-truth (GT) and ASR transcripts.
    if config.bert_dir:
        print(f"loading model3 (bert pretrained on GT and ASR) from {config.bert_dir}")
        chkpt_path = os.path.join(config.bert_dir, 'best_ckpt.pth')
        model_dict = self.bert.state_dict()
        pretrained_dict = torch.load(chkpt_path)
        # Strip the wrapper prefix from checkpoint keys and keep only those
        # that exist in this BERT instance.
        pretrained_dict = {
            k.split(".", 1)[1]: v
            for k, v in pretrained_dict.items()
            if k.split(".", 1)[1] in model_dict
        }
        self.bert.load_state_dict(pretrained_dict)

    # Alexa's encoder is commented out; project the Lugosch encoder output to
    # the BERT hidden size instead (bert hidden_size = 768, enc_dim = 128).
    self.aux_embedding = nn.Linear(config.enc_dim, self.bert.config.hidden_size)

    self.lugosch_model = lugosch.models.PretrainedModel(config)
    pretrained_model_path = os.path.join(config.libri_folder, "libri_pretraining",
                                         "model_state.pth")
    self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
    self.config = config

    # Freeze phoneme and word layers; unfreeze gradually during training.
    self.freeze_all_layers()
    self.unfreezing_index = 1

    self.maxpool = MaskedMaxPool()
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
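# `freeze_all_layers` and `unfreezing_index` suggest the gradual-unfreezing
# recipe from Lugosch et al.: start with the pretrained speech encoder fully
# frozen and unfreeze it from the top down as training progresses. The helpers
# below are a minimal sketch under that assumption; the layer grouping via
# `children()` is hypothetical and not taken from the repository.
def freeze_all_layers(model):
    # Freeze every parameter of the pretrained Lugosch encoder.
    for param in model.lugosch_model.parameters():
        param.requires_grad = False

def unfreeze_next_layer(model):
    # Unfreeze the top `model.unfreezing_index` child modules of the encoder.
    children = list(model.lugosch_model.children())
    for child in children[-model.unfreezing_index:]:
        for param in child.parameters():
            param.requires_grad = True
    model.unfreezing_index += 1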
def __init__(self, input_dim, num_layers, num_classes, encoder_dim=None,
             bert_pretrained=True, bert_pretrained_model_name='bert-base-cased'):
    super().__init__()
    self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)
    self.encoder_dim = encoder_dim
    if encoder_dim is None:
        # Default to half the BERT hidden size so the bidirectional output
        # (2 * encoder_dim) matches BERT's hidden size.
        encoder_dim = self.bert.config.hidden_size // 2
    self.speech_encoder = SubsampledBiLSTMEncoder(input_dim=input_dim,
                                                  encoder_dim=encoder_dim,
                                                  num_layers=num_layers)
    # The BiLSTM output is 2 * encoder_dim; project it to the BERT hidden size.
    self.aux_embedding = nn.Linear(2 * encoder_dim, self.bert.config.hidden_size)
    self.maxpool = MaskedMaxPool()
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
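# `SubsampledBiLSTMEncoder` is not shown in this excerpt. A minimal sketch,
# assuming a stacked bidirectional LSTM that halves the time resolution between
# layers by concatenating adjacent frames (pyramidal-LSTM style); the exact
# subsampling scheme used in the repository is an assumption.
import torch
import torch.nn as nn

class SubsampledBiLSTMEncoder(nn.Module):
    def __init__(self, input_dim, encoder_dim, num_layers):
        super().__init__()
        layers = []
        in_dim = input_dim
        for _ in range(num_layers):
            layers.append(nn.LSTM(in_dim, encoder_dim, batch_first=True,
                                  bidirectional=True))
            # After subsampling, adjacent frames are concatenated: 2 * (2 * encoder_dim).
            in_dim = 4 * encoder_dim
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        # x: (batch, time, input_dim) acoustic features
        for i, lstm in enumerate(self.layers):
            x, _ = lstm(x)                            # (batch, time, 2 * encoder_dim)
            if i < len(self.layers) - 1:
                # Drop a trailing frame if needed, then merge pairs of frames.
                batch, time, dim = x.shape
                time = time - (time % 2)
                x = x[:, :time].reshape(batch, time // 2, 2 * dim)
        return x                                      # (batch, time', 2 * encoder_dim)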
def __init__(self, config, input_dim, num_layers, num_classes, encoder_dim=None,
             bert_pretrained=True, bert_pretrained_model_name='bert-base-cased'):
    super().__init__()
    self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)

    ### Comment out Alexa's encoder
    # self.encoder_dim = encoder_dim
    # if encoder_dim is None:
    #     self.speech_encoder = SubsampledBiLSTMEncoder(input_dim=input_dim, encoder_dim=self.bert.config.hidden_size//2, num_layers=num_layers)
    # else:
    #     self.speech_encoder = SubsampledBiLSTMEncoder(input_dim=input_dim, encoder_dim=encoder_dim, num_layers=num_layers)

    # Project the Lugosch encoder output to the BERT hidden size and back
    # (bert hidden_size = 768, enc_dim = 128).
    self.aux_embedding = nn.Linear(config.enc_dim, self.bert.config.hidden_size)
    self.aux_reverse = nn.Linear(self.bert.config.hidden_size, config.enc_dim)

    self.lugosch_model = lugosch.models.PretrainedModel(config)
    pretrained_model_path = os.path.join(config.libri_folder, "libri_pretraining",
                                         "model_state.pth")
    self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
    self.config = config

    # Freeze phoneme and word layers; unfreeze gradually during training.
    self.freeze_all_layers()
    self.unfreezing_index = 1

    self.maxpool = MaskedMaxPool()
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    # Based on Lugosch's intent module (class Model in models.py).
    self.intent_layers = []
    self.num_values_total = num_classes  # default for FluentAI
    self.num_rnn_layers = len(config.intent_rnn_num_hidden)
    self.out_dim = config.word_rnn_num_hidden[-1]
    if config.word_rnn_bidirectional:
        self.out_dim *= 2

    for idx in range(self.num_rnn_layers):
        # recurrent layer
        print("config.intent_rnn_bidirectional :", config.intent_rnn_bidirectional)
        layer = torch.nn.GRU(input_size=self.out_dim,
                             hidden_size=config.intent_rnn_num_hidden[idx],
                             batch_first=True,
                             bidirectional=config.intent_rnn_bidirectional)
        layer.name = "intent_rnn%d" % idx
        self.intent_layers.append(layer)

        self.out_dim = config.intent_rnn_num_hidden[idx]
        if config.intent_rnn_bidirectional:
            self.out_dim *= 2

        # grab the hidden states of the RNN for each timestep
        layer = RNNSelect()
        layer.name = "intent_rnn_select%d" % idx
        self.intent_layers.append(layer)

        # dropout
        layer = torch.nn.Dropout(p=config.intent_rnn_drop[idx])
        layer.name = "intent_dropout%d" % idx
        self.intent_layers.append(layer)

        # downsample along the time axis
        layer = Downsample(method=config.intent_downsample_type[idx],
                           factor=config.intent_downsample_len[idx],
                           axis=1)
        layer.name = "intent_downsample%d" % idx
        self.intent_layers.append(layer)

    layer = torch.nn.Linear(self.out_dim, self.num_values_total)
    layer.name = "final_classifier"
    self.intent_layers.append(layer)

    # maxpool over time: (batch, time, classes) -> (batch, classes)
    layer = FinalPool()
    layer.name = "final_pool"
    self.intent_layers.append(layer)

    self.lugosch_classifier = torch.nn.ModuleList(self.intent_layers)
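# `RNNSelect`, `Downsample`, and `FinalPool` follow Lugosch's intent module and
# are not reproduced in this excerpt. Minimal sketches of their likely behaviour,
# inferred from how they are used above; the exact semantics (e.g. which
# downsampling methods are supported) are assumptions.
import torch
import torch.nn as nn

class RNNSelect(nn.Module):
    def forward(self, rnn_output):
        # nn.GRU returns (output, h_n); keep only the per-timestep outputs.
        return rnn_output[0]

class Downsample(nn.Module):
    def __init__(self, method, factor, axis):
        super().__init__()
        self.method, self.factor, self.axis = method, factor, axis

    def forward(self, x):
        # Reduce the sequence length along `axis` by keeping every `factor`-th frame.
        if self.factor == 1 or self.method == "none":
            return x
        idx = torch.arange(0, x.shape[self.axis], self.factor, device=x.device)
        return x.index_select(self.axis, idx)

class FinalPool(nn.Module):
    def forward(self, x):
        # Max-pool over time: (batch, time, num_classes) -> (batch, num_classes).
        return x.max(dim=1).values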