Example #1
    def test_fails_no_distances(self):
        dataloader = self.setup_mini_dataset()
        config = CodeTransformerCoreConfig(
            encoder_layer=CodeTransformerLayerConfig(d_model=512,
                                                     nhead=8,
                                                     dim_feedforward=2048,
                                                     activation="gelu",
                                                     num_relative_distances=0,
                                                     use_token_distances=False,
                                                     use_content_content=True,
                                                     use_content_pos=True,
                                                     use_pos_content=True,
                                                     use_pos_pos=True),
            num_layers=4,
        )
        language_model_config = TransformerLMDecoderConfig(
            lm_encoder=TransformerLMEncoderConfig(
                config,
                vocab_size=len(self.word_vocab.vocabulary),
                num_node_types=len(self.node_type_vocab.vocabulary),
                num_token_types=len(self.token_type_vocab.vocabulary)),
            sos_id=-1,
        )
        with self.assertRaises(Exception):
            transformer_lm = TransformerLanguageModel(
                transformer_lm_encoder=language_model_config['lm_encoder'],
                output_nonlinearity=language_model_config['output_nonlinearity'],
                loss_fct=language_model_config['loss_fct'])
            batch: CTBatch = next(iter(dataloader))
            transformer_lm.forward_batch(batch)
Example #2
    def __init__(self, config: TransformerLMDecoderConfig):
        # Accept either an already instantiated encoder module or a plain config dict
        if not isinstance(config.lm_encoder, nn.Module):
            config.lm_encoder = GreatEncoder(
                GreatEncoderConfig(**config.lm_encoder))

        # Expose the encoder's hidden dimension as the decoder's model dimension
        config.lm_encoder.d_model = config.lm_encoder.transformer.hidden_dim

        super(GreatTransformerDecoder, self).__init__(config)
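
The constructor above accepts either a ready-made encoder module or a plain config dict. A minimal sketch of the same coercion pattern, using only standard PyTorch (ToyEncoder/ToyDecoder are made-up names, not part of code_transformer):

import torch.nn as nn

class ToyEncoder(nn.Module):
    def __init__(self, hidden_dim: int = 16):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.proj = nn.Linear(hidden_dim, hidden_dim)

class ToyDecoder(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        if not isinstance(encoder, nn.Module):
            # coerce a plain config dict into a module, as in the example above
            encoder = ToyEncoder(**encoder)
        self.encoder = encoder
        self.d_model = encoder.hidden_dim  # expose the encoder width to the decoder

# both call styles end up with the same structure
decoder_from_module = ToyDecoder(ToyEncoder(hidden_dim=32))
decoder_from_dict = ToyDecoder({"hidden_dim": 32})
assert decoder_from_dict.d_model == decoder_from_module.d_model == 32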
Example #3
    def load_model(self, run_id, snapshot_iteration, gpu=True):
        model_params = self.load_parameters(run_id, snapshot_iteration, gpu=gpu)
        config = self.load_config(run_id)
        model_config = self._prepare_model_config(config)

        language = config['data_setup']['language']
        data_manager = CTPreprocessedDataManager(DATA_PATH_STAGE_2, language)

        decoder_config = model_config['lm_decoder']

        word_vocab, token_type_vocab, node_type_vocab = data_manager.load_vocabularies()

        transformer_encoder_config = model_config['lm_encoder']
        transformer_encoder_config['num_token_types'] = len(token_type_vocab)
        transformer_encoder_config['vocab_size'] = len(word_vocab)

        decoder_config['sos_id'] = word_vocab[SOS_TOKEN]
        if 'num_subtokens_output' in config['data_setup']:
            decoder_config['output_subtokens_per_token'] = config['data_setup']['num_subtokens_output']
        else:
            decoder_config['output_subtokens_per_token'] = NUM_SUB_TOKENS

        if 'use_pointer_network' in config['data_setup']:
            decoder_config['use_pointer_network'] = config['data_setup']['use_pointer_network']

        decoder_config['lm_encoder'] = transformer_encoder_config
        decoder_config['loss_fct'] = model_config['loss_fct']

        model = XLNetTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))

        try:
            model.load_state_dict(model_params)
        except RuntimeError:
            # In most cases, this is due to the legacy issue with encoder_self_attention
            model.add_module('encoder_self_attention',
                             MultiheadAttention(model.d_model, decoder_config['decoder_nhead'],
                                                dropout=decoder_config['decoder_dropout']))
            try:
                model.load_state_dict(model_params)
            except RuntimeError:
                decoder_config['concat_query_and_pointer'] = False
                model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))
                model.load_state_dict(model_params)

        return model
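
The try/except chain in load_model retrofits legacy checkpoints whose layout differs from the current model: it attempts a strict load, adapts the module, and retries. A minimal sketch of the same fallback idea with plain PyTorch (TinyModel and extra_head are made-up stand-ins, not the code_transformer classes):

import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

def load_with_fallback(model, state_dict):
    try:
        model.load_state_dict(state_dict)      # strict load: keys and shapes must match
    except RuntimeError:
        # the legacy checkpoint carries an extra submodule: add it, then retry
        model.add_module('extra_head', nn.Linear(8, 8))
        model.load_state_dict(state_dict)
    return model

# a "legacy" checkpoint that contains the extra submodule
legacy = TinyModel()
legacy.add_module('extra_head', nn.Linear(8, 8))
load_with_fallback(TinyModel(), legacy.state_dict())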
Example #4
        def init_model():
            encoder_config['transformer'] = transformer_config
            decoder_config['lm_encoder'] = XLNetLMEncoder(
                TransformerLMEncoderConfig(**encoder_config))
            model = XLNetTransformerDecoder(
                TransformerLMDecoderConfig(**decoder_config))

            num_params = sum(
                [len(params.view(-1)) for params in model.parameters()])
            print(f"Model has {num_params} parameters")

            return model
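
The init_model helper above counts parameters with len(params.view(-1)); a quick sketch on a toy layer (plain PyTorch, hypothetical sizes) showing the equivalent, more common p.numel() form:

import torch.nn as nn

layer = nn.Linear(10, 4)   # 10 * 4 weights + 4 biases = 44 parameters
num_params = sum(p.numel() for p in layer.parameters())
assert num_params == sum(len(p.view(-1)) for p in layer.parameters()) == 44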
Example #5
        def init_model():
            encoder_config['transformer_config'] = GreatTransformerConfig(
                **transformer_config)
            decoder_config['lm_encoder'] = GreatEncoderTransformerAdapter(
                GreatEncoderConfig(**encoder_config))
            model = GreatTransformerDecoder(
                TransformerLMDecoderConfig(**decoder_config))

            num_params = sum(
                [len(params.view(-1)) for params in model.parameters()])
            print(f"Model has {num_params} parameters")

            return model
Example #6
    def _init_model(self,
                    lm_encoder: dict,
                    lm_decoder: dict,
                    with_cuda: bool,
                    label_smoothing=None):

        transformer_config = GreatTransformerConfig(
            **lm_encoder['transformer_config'])

        config = GreatEncoderConfig(**lm_encoder)

        num_edge_types = 0
        for d in self.relative_distances:
            if d in ["ancestor_sp", "sibling_sp"]:
                num_edge_types += 2
            elif d == "shortest_paths":
                num_edge_types += 1
        transformer_config.bias_dim = num_edge_types
        config.transformer_config = transformer_config

        if hasattr(self, 'word_vocab'):
            config.vocab_size = len(self.word_vocab.vocabulary)
        if hasattr(self, 'node_type_vocab'):
            config.num_node_types = len(self.node_type_vocab.vocabulary)
        if hasattr(self, "num_sub_tokens"):
            config.subtokens_per_token = self.num_sub_tokens
        if hasattr(self, 'num_languages'):
            config.num_languages = self.num_languages

        great_lm_encoder = GreatEncoderTransformerAdapter(config)

        if label_smoothing is None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
        else:
            loss_fct = LabelSmoothingLoss(label_smoothing)

        model_config = TransformerLMDecoderConfig(
            great_lm_encoder,
            sos_id=self.word_vocab[SOS_TOKEN],
            unk_id=self.word_vocab[UNKNOWN_TOKEN],
            loss_fct=loss_fct,
            output_subtokens_per_token=self.dataset_train.num_sub_tokens_output,
            use_pointer_network=self.use_pointer_network if hasattr(
                self, "use_pointer_network") else False,
            **lm_decoder)
        self.model_manager = GreatModelManager()
        self.model_lm = GreatTransformerDecoder(model_config)

        self.with_cuda = with_cuda
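
A small worked example for the edge-type count in _init_model above, using a hypothetical relative_distances list: ancestor and sibling distances each add two edge types, shortest paths add one, so bias_dim = 2 + 2 + 1 = 5 here.

relative_distances = ["ancestor_sp", "sibling_sp", "shortest_paths"]
num_edge_types = 0
for d in relative_distances:
    if d in ["ancestor_sp", "sibling_sp"]:
        num_edge_types += 2          # these distances each contribute two edge types
    elif d == "shortest_paths":
        num_edge_types += 1          # shortest paths contribute a single edge type
assert num_edge_types == 5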
Example #7
        def init_model():
            transformer_config['encoder_layer'] = CodeTransformerLayer(
                **layer_config)
            encoder_config['transformer'] = CodeTransformer(
                CodeTransformerCoreConfig(**transformer_config))
            decoder_config['lm_encoder'] = TransformerLMEncoder(
                TransformerLMEncoderConfig(**encoder_config))
            model = CodeTransformerDecoder(
                TransformerLMDecoderConfig(**decoder_config))

            num_params = sum(
                [len(params.view(-1)) for params in model.parameters()])
            print(f"Model has {num_params} parameters")

            return model
Example #8
    def _init_model(self,
                    lm_encoder: dict,
                    lm_decoder: dict,
                    with_cuda: bool,
                    label_smoothing=None):
        if hasattr(self.dataset_train, 'num_sub_tokens_output'):
            num_sub_tokens_output = self.dataset_train.num_sub_tokens_output
        else:
            num_sub_tokens_output = 5

        self.model_manager = CodeTransformerModelManager()
        if hasattr(self, 'pretrained_model'):
            self.model_lm = self.pretrained_model
            self.model_lm.output_subtokens_per_token = num_sub_tokens_output
        else:
            lm_encoder = self.generate_transformer_lm_encoder_config(
                lm_encoder)

            if label_smoothing is None:
                loss_fct = CrossEntropyLoss(ignore_index=-1)
            else:
                loss_fct = LabelSmoothingLoss(label_smoothing)

            model_config = TransformerLMDecoderConfig(
                lm_encoder=lm_encoder,
                sos_id=self.word_vocab[SOS_TOKEN],
                unk_id=self.word_vocab[UNKNOWN_TOKEN],
                loss_fct=loss_fct,
                use_pointer_network=self.use_pointer_network if hasattr(
                    self, "use_pointer_network") else False,
                output_subtokens_per_token=num_sub_tokens_output,
                target_vocab_size=len(self.method_name_vocab)
                if self.use_separate_vocab else None,
                **lm_decoder)

            self.model_lm = CodeTransformerDecoder(model_config)

        if hasattr(self, "freeze_encoder_layers"):
            layers = self.model_lm.lm_encoder.transformer.layers
            freeze_encoder_layers = len(
                layers) if self.freeze_encoder_layers == 'all' else min(
                    len(layers), self.freeze_encoder_layers)
            print(f"Freezing {freeze_encoder_layers} encoder layers.")
            for i in range(freeze_encoder_layers):
                for param in layers[i].parameters():
                    param.requires_grad = False

        self.with_cuda = with_cuda
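
The freezing logic at the end of _init_model disables gradients for the first k encoder layers. A minimal sketch of the same idiom on a toy stack of linear layers (plain PyTorch, hypothetical sizes):

import torch.nn as nn

layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])
freeze_encoder_layers = 2
for i in range(freeze_encoder_layers):
    for param in layers[i].parameters():
        param.requires_grad = False   # frozen layers receive no gradient updates

assert not any(p.requires_grad for p in layers[0].parameters())
assert all(p.requires_grad for p in layers[-1].parameters())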
Example #9
    def generate_language_model_default_config(self,
                                               transformer_config: CodeTransformerCoreConfig = None) \
            -> TransformerLMDecoderConfig:
        if transformer_config is None:
            transformer_config = TestCodeTransformer.generate_transformer_default_config()
        encoder_conf = TransformerLMEncoderConfig(transformer_config,
                                                  vocab_size=113,
                                                  num_node_types=5,
                                                  num_token_types=13,
                                                  subtokens_per_token=5,
                                                  input_nonlinearity="tanh")

        return TransformerLMDecoderConfig(encoder_conf,
                                          sos_id=-1,
                                          output_nonlinearity=None)
Example #10
    def __init__(self, config: TransformerLMDecoderConfig):
        if not isinstance(config.lm_encoder, nn.Module):
            config.lm_encoder = XLNetLMEncoder(
                TransformerLMEncoderConfig(**config.lm_encoder))

        super(XLNetTransformerDecoder, self).__init__(config)
Example #11
    def test_mini_dataset(self):
        def evaluate_predictions(logits, labels, loss=None):
            correct = logits.argmax(-1) == labels
            all_correct = correct.prod(-1)
            correct_tokens = all_correct.float().mean().cpu().item()
            ret = dict(correct_tokens=correct_tokens)
            if loss is not None:
                ret['loss'] = loss.detach().cpu().item()
            return ret

        BATCH_SIZE = 13
        NUM_PREDICT = 5

        dataloader = self.setup_mini_dataset()

        config = CodeTransformerCoreConfig(
            encoder_layer=CodeTransformerLayerConfig(d_model=16,
                                                     nhead=8,
                                                     dim_feedforward=32,
                                                     activation="gelu",
                                                     num_relative_distances=4,
                                                     use_token_distances=True,
                                                     use_content_content=True,
                                                     use_content_pos=True,
                                                     use_pos_content=True,
                                                     use_pos_pos=True),
            num_layers=4,
        )

        language_model_config = TransformerLMDecoderConfig(
            lm_encoder=TransformerLMEncoderConfig(
                config,
                vocab_size=len(self.word_vocab.vocabulary),
                num_node_types=len(self.node_type_vocab.vocabulary),
                num_token_types=len(self.token_type_vocab.vocabulary)),
            sos_id=-1)
        transformer_lm = TransformerLanguageModel(
            transformer_lm_encoder=language_model_config['lm_encoder'],
            output_nonlinearity=language_model_config['output_nonlinearity'],
            loss_fct=language_model_config['loss_fct'])
        batch: CTBatch = next(iter(dataloader))

        cuda = torch.cuda.is_available() and RUN_TESTS_ON_GPU
        if cuda:
            transformer_lm = transformer_lm.cuda()

        opt = optim.Adam(transformer_lm.parameters(), lr=1e-4)
        tq = tqdm(range(500))

        if cuda:
            with self.assertRaises(RuntimeError):
                # CPU input on CUDA model should fail
                output = transformer_lm.forward_batch(batch)
            batch = batch_to_device(batch, "cuda")

        assert not (batch.labels == self.word_vocab['</s>']).any().item()
        for _ in tq:
            output = transformer_lm.forward_batch(batch)
            output.loss.backward()
            opt.step()
            opt.zero_grad()
            evaluation = evaluate_predictions(output.logits, batch.labels)
            acc = evaluation['correct_tokens']
            tq.set_postfix(loss=output.loss.cpu().item(), acc=acc)

            predicted_tokens = output.logits.argmax(-1)
            generated_text = batch_decode(self.word_vocab, predicted_tokens)
            generated_text2 = [
                " ".join([
                    "_".join([
                        self.word_vocab.reverse_lookup(subtoken.item())
                        for subtoken in token
                    ]) for token in sample
                ]) for sample in predicted_tokens
            ]
            assert list(generated_text) == generated_text2
        assert acc > 0.98
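
evaluate_predictions counts a token as correct only when every one of its subtokens is predicted correctly. A minimal sketch of that metric on toy tensors (hypothetical values, not the CTBatch format):

import torch

logits = torch.tensor([[[0.1, 0.9], [0.8, 0.2]],    # sample 1 predicts subtokens [1, 0]
                       [[0.7, 0.3], [0.2, 0.8]]])   # sample 2 predicts subtokens [0, 1]
labels = torch.tensor([[1, 0],
                       [0, 0]])

correct = (logits.argmax(-1) == labels).float()     # per-subtoken correctness (0/1)
all_correct = correct.prod(-1)                      # 1 only if every subtoken matches
accuracy = all_correct.mean().item()                # 0.5 for this toy batch
assert accuracy == 0.5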
Example #12
    def load_model(self, run_id, snapshot_iteration, gpu=True):
        model_params = self.load_parameters(run_id, snapshot_iteration, gpu=gpu)
        config = self.load_config(run_id)
        model_config = self._prepare_model_config(config)

        language = config['data_setup']['language']
        use_only_ast = config['data_setup']['use_only_ast'] if 'use_only_ast' in config['data_setup'] else False
        data_manager = CTPreprocessedDataManager(DATA_PATH_STAGE_2, language)

        decoder_config = model_config['lm_decoder']

        vocabularies = data_manager.load_vocabularies()
        if len(vocabularies) == 3:
            word_vocab, token_type_vocab, node_type_vocab = vocabularies
            use_separate_vocab = False
        else:
            word_vocab, token_type_vocab, node_type_vocab, method_name_vocab = vocabularies
            use_separate_vocab = True

        encoder_config = model_config['lm_encoder']
        encoder_config['num_node_types'] = len(node_type_vocab)
        if use_only_ast:
            encoder_config['num_token_types'] = None
        else:
            encoder_config['num_token_types'] = len(token_type_vocab)
        encoder_config['vocab_size'] = len(word_vocab)
        encoder_config['transformer']['encoder_layer']['num_relative_distances'] = len(
            config['data_transforms']['relative_distances'])
        decoder_config['sos_id'] = word_vocab[SOS_TOKEN]
        if 'num_subtokens_output' in config['data_setup']:
            decoder_config['output_subtokens_per_token'] = config['data_setup']['num_subtokens_output']
        else:
            decoder_config['output_subtokens_per_token'] = NUM_SUB_TOKENS

        if 'use_pointer_network' in config['data_setup']:
            decoder_config['use_pointer_network'] = config['data_setup']['use_pointer_network']

        if ',' in data_manager.language:
            encoder_config['num_languages'] = len(data_manager.language.split(','))

        decoder_config['lm_encoder'] = encoder_config
        decoder_config['loss_fct'] = model_config['loss_fct']

        if use_separate_vocab:
            decoder_config['target_vocab_size'] = len(method_name_vocab)

        model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))

        try:
            model.load_state_dict(model_params)
        except RuntimeError:
            # In most cases, this is due to the legacy issue with encoder_self_attention
            model.add_module('encoder_self_attention',
                             MultiheadAttention(model.d_model, decoder_config['decoder_nhead'],
                                                dropout=decoder_config['decoder_dropout']))
            try:
                model.load_state_dict(model_params)
            except RuntimeError:
                decoder_config['concat_query_and_pointer'] = False
                model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))
                try:
                    model.load_state_dict(model_params)
                except:
                    decoder_config['concat_query_and_pointer'] = True
                    model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))
                    model.lm_encoder.language_embedding = None
                    try:
                        model.load_state_dict(model_params)
                    except:
                        decoder_config['concat_query_and_pointer'] = False
                        model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))

                        class PositionalEncodingMock(nn.Module):
                            def forward(self, x, position):
                                return x

                        model.positional_encoding = PositionalEncodingMock()
                        model.load_state_dict(model_params)

        return model