Example No. 1
    def __init__(self, max_seq_len : int, model_name: str, model_path: Optional[str] = '', device: Optional[str] = 'cpu'):
        """
        Parameters
        ----------
        max_seq_len : int
            Maximum sequence length, passed to the tokenizer as model_max_length.
        model_name : str
            The name of the model; should be one of 'word2vec', 'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2'.
        model_path : str, optional
            The path to the w2v file / fine-tuned Transformer model. Required for w2v.
        device : str, optional
            The device to load the model on, e.g. 'cpu' or 'cuda'.
        """
        super(TextEmbedder, self).__init__()
        
        assert model_name in ['word2vec', 'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2']
        self.max_seq_len = max_seq_len
        self.model_name = model_name
        self.device = torch.device(device)
        if model_path == '':
            model_path = model_name  # TODO check if the 
        print('TextEmbedder: Loading model {} ({})'.format(model_name, model_path))
        if model_name == 'word2vec':
            assert os.path.isfile(model_path)
            self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', model_max_length=self.max_seq_len+1)
            self._load_weibo_w2v(model_path)
            self.embed_dim = 300
        elif model_name in ['vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2']:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
            self.model = AutoModel.from_pretrained(model_path, return_dict=True).to(self.device)  # T5 for news doesn't have 'add_pooling_layer' option
            self.embed_dim = 768
        else:
            assert model_path in ['xlm-roberta-base', 'xlm-roberta-large'] or os.path.isdir(model_path)
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
            self.model = XLMRobertaModel.from_pretrained(model_path, return_dict=True, add_pooling_layer=False).to(self.device)
            self.embed_dim = 768
        print('TextEmbedder: Finished loading model {}'.format(model_name))
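A minimal instantiation sketch for the constructor above (the model name, sequence length and device are illustrative; assumes the full TextEmbedder class plus torch and transformers are available):

# Hypothetical usage; downloads xlm-roberta-base from the Hugging Face hub.
embedder = TextEmbedder(max_seq_len=128, model_name='xlm-roberta-base', device='cpu')
print(embedder.embed_dim)  # 768 for the base model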
Example No. 2
    def __init__(self, vocabs, opt, predict_inverse=False):
        super(XLMRPredictor, self).__init__(vocabs=vocabs, opt=opt)

        self.xlmr = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.model = nn.Sequential(*list(self.xlmr.children())[:-1])
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
Example No. 3
    def __init__(self,
                 model_dir_or_name: str,
                 layers: str = '-1',
                 pooled_cls: bool = False):
        super().__init__()

        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
        # check that encoder_layer_number covers the requested layers
        encoder_layer_number = len(self.encoder.encoder.layer)

        if isinstance(layers, list):
            self.layers = [int(l) for l in layers]
        elif isinstance(layers, str):
            self.layers = list(map(int, layers.split(',')))
        else:
            raise TypeError("`layers` only supports str or list[int]")

        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a RoBERTa model with {encoder_layer_number} layers."
            else:
                assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a RoBERTa model with {encoder_layer_number} layers."

        self._cls_index = self.tokenzier.encoder['<s>']
        self._sep_index = self.tokenzier.encoder['</s>']
        # needed for generating word pieces
        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
        self.pooled_cls = pooled_cls
Example No. 4
    def get_embeddings_contextualized(self,
                                      dataset: CLIRDataset) -> torch.Tensor:
        """
        Compute average contextualized embeddings for multi-piece words
        """
        # encoded input sentences with XLMRoberta
        model = XLMRobertaModel.from_pretrained('xlm-roberta-base',
                                                output_hidden_states=True)
        all_input_ids = [input_ids.input_ids for input_ids in dataset.data]
        all_token_len = [token_len.token_len for token_len in dataset.data]
        all_input_ids = torch.tensor(all_input_ids)
        all_outputs = model(all_input_ids)
        embeddings = all_outputs[2][0]  # embedding-layer hidden state; shape (batch_size, max_seq_len, emb_dim)

        doc_size, _, emb_size = embeddings.shape

        # average all pieces for multi-piece words
        idxs, masks, token_num, token_len = token_lens_to_idxs(all_token_len)
        # reshape idxs from (batch_size, seq_len) to (batch_size, seq_len, embed_size)
        idxs = all_input_ids.new(idxs).unsqueeze(-1).expand(
            doc_size, -1, emb_size) + 1
        masks = embeddings.new(masks).unsqueeze(-1)
        outputs = torch.gather(embeddings, 1, idxs) * masks
        outputs = outputs.view(doc_size, token_num, token_len, emb_size)
        outputs = outputs.mean(dim=2)
        return outputs
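For reference, a standalone sketch of the hidden-state extraction this example relies on (sentence and variable names are illustrative; assumes a recent transformers version where model outputs expose a hidden_states attribute):

import torch
from transformers import XLMRobertaModel, XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)
inputs = tokenizer("The dog is cute", return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
# hidden_states holds the embedding-layer output plus one tensor per encoder layer
print(len(outputs.hidden_states), outputs.hidden_states[0].shape)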
Example No. 5
 def __init__(self, config):
     super(Model, self).__init__()
     self.bert = XLMRobertaModel.from_pretrained(config.pretrained_path)
     for param in self.bert.parameters():
         param.requires_grad = True
     self.fc = nn.Linear(config.hidden_size, 1)
     self.sigmoid = nn.Sigmoid()
Example No. 6
    def __init__(self, choose_model):
        """Initialize Text Encoder.

        Args:
            choose_model (str): Only XLM-R possible for now.
        """
        self.model_max_length = 128
        if choose_model.lower() == "XLM-R".lower():
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(
                'xlm-roberta-base', model_max_length=self.model_max_length)
            self.model = XLMRobertaModel.from_pretrained(
                'xlm-roberta-base', output_hidden_states=True)
            self.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            self.model.to(self.device, non_blocking=True)

            self.PAD_TOKEN = "<pad>"
            self.BOS_TOKEN = "<s>"
            self.EOS_TOKEN = "</s>"
            self.UNK_TOKEN = "<unk>"

            self.add_special_token = True
            self.pad_to_max_length = True

            self.target_embedding_matrix = []
            self.proj_embedding_source_target = []
            self.src_word2ind = {}
            self.trg_word2ind = {}
            self.src_ind2word = {}
            self.trg_ind2word = {}
            self.norm_trg_embedding_matrix = []

        else:
            raise ValueError("No correct model was chosen!")
Example No. 7
    def __init__(self, num_classes):
        super(XLMRoBERTaNER, self).__init__()
        self.embedding_dim = 768
        self.num_classes = num_classes

        self.RoBERTa = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.linear = nn.Linear(self.embedding_dim, self.num_classes)
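The example only defines the constructor; a hypothetical token-classification forward pass consistent with these layers might look like the sketch below (the method name and signature are assumptions, not part of the original):

    def forward(self, input_ids, attention_mask=None):
        # (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, num_classes)
        hidden = self.RoBERTa(input_ids, attention_mask=attention_mask)[0]
        return self.linear(hidden)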
Example No. 8
 def __init__(self, config, args):
     super(XPairModel, self).__init__(config)
     self.args = args
     self.num_labels = args.num_pair_labels
     self.roberta = XLMRobertaModel(config=config)
     self.classifier = SequenceClassifier(config.hidden_size,
                                          self.num_labels)
Example No. 9
 def __init__(self):
     super(ToxicSimpleNNModel, self).__init__()
     self.backbone = XLMRobertaModel.from_pretrained(BACKBONE_PATH)
     self.dropout = nn.Dropout(0.25)
     self.linear = nn.Linear(
         in_features=self.backbone.pooler.dense.out_features*2,
         out_features=2,
     )
Example No. 10
 def __init__(self, config, model_name, only_embedding=True, output_hidden_states=True):
     super(XLMTokenEmbedder, self).__init__(config)
     self.config = config
     self.only_embedding = only_embedding
     self.model = XLMRobertaModel.from_pretrained(model_name, output_hidden_states=output_hidden_states)
     if self.only_embedding:
         self.model = self.model.get_input_embeddings()
         self.model.weight.requires_grad = False
Example No. 11
 def __init__(self, config):
     super(Bert_CRF, self).__init__(config)
     self.num_labels = config.num_labels
     self.bert = XLMRobertaModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, self.num_labels)
     self.init_weights()
     self.crf = CRF(self.num_labels, batch_first=True)
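The forward pass is not shown; a hedged sketch of how these layers are typically combined, assuming the CRF layer follows the torchcrf API (method name and signature are assumptions):

 def forward(self, input_ids, attention_mask, labels=None):
     sequence_output = self.dropout(self.bert(input_ids, attention_mask=attention_mask)[0])
     emissions = self.classifier(sequence_output)  # (batch_size, seq_len, num_labels)
     if labels is not None:
         # negative log-likelihood of the gold tag sequence under the CRF
         return -self.crf(emissions, labels, mask=attention_mask.bool())
     return self.crf.decode(emissions, mask=attention_mask.bool())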
Example No. 12
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XLMRobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()
Example No. 13
 def __init__(self, hparams):
     super().__init__()
     self.encoder = XLMRobertaModel.from_pretrained(
         hparams.pretrained_path)  # load pretrained model
     self.encoder_size = hparams.bert_output  # encoder output size - 768 for base architecture
     self.num_output = hparams.num_output  # number of label - 6
     self.fc = nn.Linear(self.encoder_size,
                         self.num_output)  # fully connected layer
     self.out = nn.Sigmoid()
Example No. 14
 def __init__(self, config, args):
     super(XNLUModel, self).__init__(config)
     self.args = args
     self.num_intent_labels = len(get_intent_labels(args))
     self.num_slot_labels = len(get_slot_labels(args))
     self.roberta = XLMRobertaModel(config=config)
     self.intent_classifier = SequenceClassifier(config.hidden_size,
                                                 self.num_intent_labels)
     self.slot_classifier = TokenClassifier(config.hidden_size,
                                            self.num_slot_labels)
Example No. 15
 def __init__(self, model_name, num_labels):
     """
     Args:
         model_name: model name, e.g. 'xlm-roberta-base'
         num_labels: number of classes to classify
     """
     super().__init__()
     self.w = nn.Linear(768, 1, bias=False)
     self.bert = XLMRobertaModel.from_pretrained(model_name)
     self.prediction_layer = nn.Linear(768, num_labels)
     self.init_weights()
Example No. 16
def make_pretrained_transformer_and_tokenizer(transformer_name: str):
    if 'distilgpt2' in transformer_name:
        print("DistilGPT2!")
        model = GPT2Model.from_pretrained('distilgpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
    else:
        print(f"Loading {transformer_name}!")
        model = XLMRobertaModel.from_pretrained(transformer_name)
        tokenizer = XLMRobertaTokenizer.from_pretrained(transformer_name)

    return model, tokenizer
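A hypothetical call to the helper above; any XLM-R checkpoint name takes the non-GPT-2 branch:

model, tokenizer = make_pretrained_transformer_and_tokenizer('xlm-roberta-base')
print(type(model).__name__, type(tokenizer).__name__)  # XLMRobertaModel XLMRobertaTokenizer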
Example No. 17
    def __init__(self, config, model_name=None, smoothing=0.0):
        super(XLMRobertaForSequenceClassification, self).__init__(config)
        self.num_labels = 42
        self.l2_reg_lambda = config.l2_reg_lambda
        if model_name is not None:
            self.xlmroberta = XLMRobertaModel.from_pretrained(model_name,
                                                              config=config)
        else:
            self.xlmroberta = XLMRobertaModel(config)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        classifier_size = config.hidden_size * 3
        self.classifier = nn.Linear(classifier_size, self.config.num_labels)
        self.latent_size = config.hidden_size
        self.latent_type = nn.Parameter(torch.FloatTensor(
            3, config.hidden_size),
                                        requires_grad=True)

        self.smoothing = smoothing != 0.0
Example No. 18
    def __init__(self, use_aux=True):
        super(ToxicSimpleNNModel, self).__init__()
        self.backbone = XLMRobertaModel.from_pretrained(BACKBONE_PATH)
        self.dropout = nn.Dropout(0.3)
        aux_len = 0

        if use_aux:
            aux_len = 5
        self.linear = nn.Linear(
            in_features=self.backbone.pooler.dense.out_features * 2,
            out_features=2 + aux_len,
        )
Example No. 19
 def __init__(self, config, args):
     super(RBERT, self).__init__(config)
     self.roberta = XLMRobertaModel(config=config)
     self.num_labels = config.num_labels
     self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size,
                                 0.1)
     self.e1_fc_layer = FCLayer(config.hidden_size, config.hidden_size, 0.1)
     self.e2_fc_layer = FCLayer(config.hidden_size, config.hidden_size, 0.1)
     self.label_classifier = FCLayer(config.hidden_size * 3,
                                     self.num_labels,
                                     0.1,
                                     use_activation=False)
Example No. 20
 def __init__(self, config, num_hidden_layers, type="bert"):
     super().__init__()
     if type == "bert":
         self.bert = BertModel.from_pretrained(
             "bert-base-multilingual-cased",
             num_hidden_layers=num_hidden_layers)
     elif type == "xlmr":
         config.num_hidden_layers = 3
         # self.bert = XLMRobertaModel.from_pretrained("xlm-roberta-base", num_hidden_layers=num_hidden_layers)
         self.bert = XLMRobertaModel(config)
     else:
         raise KeyError(f"{type} is not a valid type!")
Example No. 21
    def __init__(self):
        super(XLMRobertaLargeTC, self).__init__()
        config = XLMRobertaConfig.from_pretrained('xlm-roberta-large',
                                                  output_hidden_states=True)
        self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large',
                                                           config=config)

        self.fc = nn.Linear(config.hidden_size, 1)
        self.dropout = nn.Dropout(p=0.2)

        # initialize weight
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)
Example No. 22
    def __init__(self,
                 model_dir_or_name: str,
                 layers: str = '-1',
                 pooled_cls: bool = False):
        super().__init__()

        self.tokenzier = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)

        # XLMRobertaTokenizer is SentencePiece-based and has no `encoder` dict,
        # so special-token ids are looked up via convert_tokens_to_ids
        self._cls_index = self.tokenzier.convert_tokens_to_ids('<s>')
        self._sep_index = self.tokenzier.convert_tokens_to_ids('</s>')
        self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids('<pad>')
        self._wordpiece_unknown_index = self.tokenzier.convert_tokens_to_ids('<unk>')
        self.pooled_cls = pooled_cls
Example No. 23
def build_model():
    src_encoder = XLMRobertaModel.from_pretrained('xlm-roberta-large')
    en2fr = TransformerModel.from_pretrained(
        '/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/',
        checkpoint_file='model.pt',
        bpe='subword_nmt',
        bpe_codes=
        '/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/bpecodes'
    )
    tgt_encoder = [
        model for name, model in en2fr.named_modules()
        if name == 'models.0.encoder'
    ][0]

    return src_encoder, tgt_encoder
Example No. 24
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels_list

        self.roberta = XLMRobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.pooler = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size), nn.Tanh())
        self.classifiers = nn.ModuleList([
            nn.Linear(config.hidden_size, num_label)
            for num_label in self.num_labels
        ])

        self.init_weights()
Example No. 25
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(XLMRoBERTa, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)

        if max_seq_length > self.tokenizer.max_len_single_sentence:
            logging.warning("XLM-RoBERTa only allows a max_seq_length of "+self.tokenizer.max_len_single_sentence)
            max_seq_length = self.tokenizer.max_len_single_sentence
        self.max_seq_length = max_seq_length
Example No. 26
def xlmr_model():
    config = XLMRobertaConfig(
        vocab_size=251000,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=256,
        type_vocab_size=2,
        is_decoder=False,
        initializer_range=0.02,
    )
    return XLMRobertaModel(config=config)
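A quick sketch of how this small test configuration might be used; it only builds a randomly initialised model, no pretrained weights are downloaded:

tiny_model = xlmr_model()
print(sum(p.numel() for p in tiny_model.parameters()))  # parameter count of the toy model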
Example No. 27
    def test_xlm_roberta_large(self):
        model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True)
        input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
        # The dog is cute and lives in the garden house

        expected_output_shape = torch.Size((1, 12, 1024))  # batch_size, sequence_length, embedding_vector_dim
        expected_output_values_last_dim = torch.tensor(
            [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]]
        )
        #  xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
        #  xlmr.eval()
        #  expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]

        output = model(input_ids)["last_hidden_state"].detach()
        self.assertEqual(output.shape, expected_output_shape)
        # compare the actual values for a slice of last dim
        self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
Example No. 28
    def __init__(self, config, num_intents=12, num_slots=31, return_dict=True):
        # BUG: num_intents is not passed through when loading the model
        super(JointClassifier, self).__init__(config)

        self.num_labels = config.num_labels

        self.num_intent_labels = num_intents
        self.num_slot_labels = num_slots

        self.roberta = XLMRobertaModel(config, add_pooling_layer=True)
        self.intent_clf = IntentClassifier(768, self.num_intent_labels)
        self.slot_clf = SlotClassifier(768, self.num_slot_labels)

        self.return_dict = return_dict

        # Initialize weights using the inherited RoBERTa init
        self.init_weights()
Example No. 29
def words_embedding(topic_abstract):
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaModel.from_pretrained('xlm-roberta-base')

    # raw_dataset = load_raw_dataset(DATA_ROOT_PATH)
    # id_texts = load_tweet_ids_with_text(raw_dataset)
    # ids = [id_texts[idx][0] for idx in range(len(id_texts))]
    # texts = [id_texts[idx][1] for idx in range(len(id_texts))]
    topic_abstract_text = ' '.join(topic_abstract)

    result = []
    input_ids = torch.tensor(
        [tokenizer.encode(topic_abstract_text,
                          add_special_tokens=True)])[:, :512]
    with torch.no_grad():
        outputs = model(input_ids)[1].tolist()
    return np.array(outputs)
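A hypothetical call with a toy topic abstract (a list of words); the returned array is the pooler output of xlm-roberta-base:

vector = words_embedding(['climate', 'change', 'policy'])
print(vector.shape)  # (1, 768)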
Example No. 30
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        super(XLMRoBERTa, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case
        self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)

        if max_seq_length > self.tokenizer.max_len_single_sentence:
            logging.warning("XLM-RoBERTa only allows a max_seq_length of " +
                            self.tokenizer.max_len_single_sentence)
            max_seq_length = self.tokenizer.max_len_single_sentence
        self.max_seq_length = max_seq_length

        self.cls_token_id = self.tokenizer.cls_token_id
        self.eos_token_id = self.tokenizer.eos_token_id
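A minimal instantiation sketch for the wrapper above (the model name is illustrative; assumes the enclosing XLMRoBERTa class, a sentence-transformers-style nn.Module, is importable along with transformers):

encoder = XLMRoBERTa('xlm-roberta-base', max_seq_length=128)
print(encoder.max_seq_length, encoder.cls_token_id, encoder.eos_token_id)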