Example #1
 def __init__(self):
     super(TextRank, self).__init__()
     self.bert_embedding = Make_Embedding(tok=CamembertTokenizer(
         'C:/Users/theo.roudil-valentin/Documents/Resume/MLSUM/MLSUM_tokenizer.model'
     ),
                                          cpu=psutil.cpu_count())
     self.camem = CamembertModel(CamembertConfig())
Example #2
 def __init__(
         self):  # ,num_labels=2): we have no labels here, so this isn't needed
     super(CustomCamembert, self).__init__()
     self.camembert = CamembertModel.from_pretrained("camembert-base")
     self.dropout = nn.Dropout(.05)
     self.classifier = nn.Linear(768, )
     self.MHA = nn.MultiheadAttention(embed_dim=768, num_heads=8)
Example #3
 def __init__(self, auto_model: str, auto_path: str):
     super().__init__()
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
Example #4
 def __init__(self, bert_model, num_classes):
     super(UmbertoCustom, self).__init__()
     self.encoder = CamembertModel.from_pretrained(bert_model)
     self.classifier = nn.Sequential(nn.Dropout(p=0.1), nn.Linear(768, 768),
                                     nn.Tanh(), nn.Dropout(p=0.1),
                                     nn.Linear(768, num_classes))
     self.loss = nn.CrossEntropyLoss()
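The snippet above defines the encoder, head, and loss but omits the forward pass. A minimal sketch of how these pieces typically combine; pooling on the first (<s>) token is an assumption here, not taken from the original class:

 def forward(self, input_ids, attention_mask, labels=None):
     # last-layer hidden states: (batch, seq_len, 768)
     hidden = self.encoder(input_ids, attention_mask=attention_mask)[0]
     # pool on the first (<s>) token, then classify
     logits = self.classifier(hidden[:, 0, :])
     if labels is not None:
         return logits, self.loss(logits, labels)
     return logits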
Example #5
File: core.py Project: tofunlp/sister
    def __init__(
        self,
        lang: str = "en",
    ):
        try:
            from transformers import (AlbertModel, AlbertTokenizer, BertConfig,
                                      BertJapaneseTokenizer, BertModel,
                                      CamembertModel, CamembertTokenizer)
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            model_name = "albert-base-v2"
            tokenizer = AlbertTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = AlbertModel.from_pretrained(model_name, config=config)
        elif lang == "fr":
            model_name = "camembert-base"
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name, config=config)
        elif lang == "ja":
            model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
            tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config)
        else:
            raise ValueError(f"unsupported lang: {lang}")

        self.tokenizer = tokenizer
        self.model = model
Example #6
File: core.py Project: MsLimon/sister
    def __init__(
            self,
            lang: str = 'en',
            ):
        try:
            from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer, AutoTokenizer
            from transformers import AlbertModel, BertModel, CamembertModel, AutoModel
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
            model = AlbertModel.from_pretrained("albert-base-v2")
        elif lang == "fr":
            tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
            model = CamembertModel.from_pretrained("camembert-base")
        elif lang == "es":
            tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
            model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
        elif lang == "ja":
            tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
            model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

        self.tokenizer = tokenizer
        self.model = model
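For reference, a minimal sketch of turning the tokenizer/model pair loaded above into a single sentence vector; the plain mean over the last hidden layer is an assumption here, not necessarily sister's actual pooling:

import torch
from transformers import CamembertModel, CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")
model.eval()

inputs = tokenizer("J'aime le camembert !", return_tensors="pt")
with torch.no_grad():
    last_hidden = model(**inputs)[0]    # (1, seq_len, 768)
sentence_vec = last_hidden.mean(dim=1)  # (1, 768)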
Example #7
    def test_output_embeds_base_model(self):
        model = CamembertModel.from_pretrained("camembert-base")
        model.to(torch_device)

        input_ids = torch.tensor(
            [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
            device=torch_device,
            dtype=torch.long,
        )  # J'aime le camembert !
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 10, 768))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
        expected_slice = torch.tensor(
            [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418],
              [-0.1561, -0.1127, 0.2687]]],
            device=torch_device,
            dtype=torch.float,
        )
        # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
        # camembert.eval()
        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()

        self.assertTrue(
            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example #8
 def load(self):
     """
     Load Camembert model from FAIR repo
     :return:
     """
     self.tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
     self.model = CamembertModel.from_pretrained(MODEL_NAME)
     self.model.eval()
Example #9
 def __init__(
     self,
     camem=CamembertModel(CamembertConfig()),
     cosim=torch.nn.CosineSimilarity(dim=-1)
 ) -> None:
     super(BERTScore, self).__init__()
     self.make_embedding = TextRank().make_embedding_bert
     self.camem = camem
     self.cosim = cosim
Example #10
 def sentence_embeddings(self):
     if not self.finetuned_bert:
         tokenizer = CamembertTokenizer.from_pretrained(
             stg.REGULAR_CAMEMBERT)
         model = CamembertModel.from_pretrained(stg.REGULAR_CAMEMBERT)
     else:
         tokenizer = AutoTokenizer.from_pretrained(
             stg.FINED_TUNED_CAMEMBERT)
         model = CamembertModel.from_pretrained(stg.FINED_TUNED_CAMEMBERT)
     if torch.cuda.is_available():
         print(
             '====== Cuda is Available, GPU will be used for this task ======'
         )
         torch.cuda.empty_cache()
         model.cuda()
         device = torch.device("cuda")
     embedding_all_text = []
     number_sentences = len(self.sentences)
     for i in tqdm(range(0, number_sentences, self.batch_size)):
         # Python slicing clips the final batch automatically,
         # so the last, shorter chunk needs no special casing
         batch = self.sentences[i:i + self.batch_size]
         encoded_input = self.get_batch_sentence_tokens(batch, tokenizer)
         if torch.cuda.is_available():
             encoded_input.to(device)
         with torch.no_grad():
             model_output = model(**encoded_input)
         sentence_embeddings_tensor = self.mean_pooling(
             model_output, encoded_input['attention_mask'])
         embedding_all_text.append(sentence_embeddings_tensor)
         if torch.cuda.is_available():
             del encoded_input
             del sentence_embeddings_tensor
             torch.cuda.empty_cache()
     sentence_embeddings = self.torch_to_array(embedding_all_text)
     return sentence_embeddings
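The mean_pooling helper called above is not shown. A common implementation, which masks out padding before averaging (an assumption following the usual sentence-transformers recipe), would look like:

 def mean_pooling(self, model_output, attention_mask):
     token_embeddings = model_output[0]  # (batch, seq_len, hidden)
     # broadcast the attention mask so padded positions contribute nothing
     mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     summed = torch.sum(token_embeddings * mask, dim=1)
     counts = torch.clamp(mask.sum(dim=1), min=1e-9)
     return summed / counts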
Example #11
 def __init__(self, device):  # args, load_pretrained_bert=False, bert_config=None
     super(Summarizer, self).__init__()
     self.device = device
     self.bert = CamembertModel.from_pretrained("camembert-base")
     #BertModel.from_pretrained('bert-base-uncased')
     #Bert(args.temp_dir, load_pretrained_bert, bert_config)
     self.encoder = Classifier(self.bert.config.hidden_size)
     self.select_sent = select_sent
     # self.score=confusion_output
     self.to(device)
Example #12
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(CamemBERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 511:
            logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511")
            max_seq_length = 511
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.camembert = CamembertModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
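To see the 511-token cap in practice, a hedged sketch of encoding with truncation (illustrative only; this module's actual tokenize method is not shown here):

model = CamemBERT("camembert-base", max_seq_length=256)
ids = model.tokenizer.encode("J'aime le camembert !",
                             max_length=model.max_seq_length,
                             truncation=True)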
Example #13
def load_initial_model():
    configuration = get_configuration()
    config = CamembertConfig(
        vocab_size=configuration["vocab_size"],
        hidden_size=configuration["hidden_size"],
        num_hidden_layers=configuration["num_hidden_layers"],
        intermediate_size=configuration["intermediate_size"],
        hidden_act=configuration["hidden_act"],
        hidden_dropout_prob=configuration["hidden_dropout_prob"],
        attention_probs_dropout_prob=configuration[
            "attention_probs_dropout_prob"],
        max_position_embeddings=configuration["max_position_embeddings"],
        type_vocab_size=configuration["type_vocab_size"])

    cam_model = CamembertModel(config)
    return cam_model
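get_configuration is not shown above; a minimal stand-in returning the keys this function reads, filled with camembert-base's published dimensions as assumed values:

def get_configuration():
    # assumed values mirroring camembert-base; adjust for your experiment
    return {
        "vocab_size": 32005,
        "hidden_size": 768,
        "num_hidden_layers": 12,
        "intermediate_size": 3072,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 514,
        "type_vocab_size": 1,
    }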
Example #14
    def __init__(
            self, file_service: FileService, device: str,
            pretrained_representations_options: PretrainedRepresentationsOptions
    ):
        super().__init__()

        self._device = device
        self.do_not_save: bool = (
            not pretrained_representations_options.fine_tune_pretrained and
            not pretrained_representations_options.fine_tune_after_convergence)

        self._include_pretrained = pretrained_representations_options.include_pretrained_model
        self._pretrained_model_size = pretrained_representations_options.pretrained_model_size
        self._pretrained_weights = pretrained_representations_options.pretrained_weights
        self._pretrained_max_length = pretrained_representations_options.pretrained_max_length
        self._pretrained_model: PreTrainedModel = None

        self._fine_tune_pretrained = pretrained_representations_options.fine_tune_pretrained
        self._fine_tune_after_convergence = pretrained_representations_options.fine_tune_after_convergence

        self._include_fasttext_model = pretrained_representations_options.include_fasttext_model

        if self._include_pretrained and self._pretrained_model_size and self._pretrained_weights:
            if pretrained_representations_options.pretrained_model == PretrainedModel.BERT:
                self._pretrained_model = BertModel.from_pretrained(
                    pretrained_representations_options.pretrained_weights)
            elif pretrained_representations_options.pretrained_model == PretrainedModel.CamemBERT:
                self._pretrained_model = CamembertModel.from_pretrained(
                    pretrained_representations_options.pretrained_weights)

            if pretrained_representations_options.fine_tune_pretrained:
                self._pretrained_model.train()
            else:
                self._pretrained_model.eval()

        if self._include_fasttext_model:
            assert pretrained_representations_options.fasttext_model is not None, 'fast text model is not supplied when include-fasttext-model is set to true'

            data_path = file_service.get_initial_data_path()
            fasttext_path = os.path.join(
                data_path, 'fasttext',
                pretrained_representations_options.fasttext_model)
            assert os.path.exists(
                fasttext_path), f'fast text model not found in {fasttext_path}'

            self._fasttext_dimension = pretrained_representations_options.fasttext_model_size
            self._fasttext_model = fasttext.load_model(fasttext_path)
Example #15
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        super(CamemBERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 511:
            logging.warning(
                "CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511"
            )
            max_seq_length = 511
        self.max_seq_length = max_seq_length

        self.camembert = CamembertModel.from_pretrained(model_name_or_path)
        self.tokenizer = CamembertTokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)
        self.cls_token_id = self.tokenizer.convert_tokens_to_ids(
            [self.tokenizer.cls_token])[0]
        self.sep_token_id = self.tokenizer.convert_tokens_to_ids(
            [self.tokenizer.sep_token])[0]
Example #16
    @classmethod
    def create(cls,
               model_type='camem',
               model_name="camembert-base",
               embedding_size=768,
               hidden_dim=512,
               rnn_layers=1,
               lstm_dropout=0.5,
               device="cuda",
               mode="weighted",
               key_dim=64,
               val_dim=64,
               num_heads=3,
               attn_dropout=0.3,
               self_attention=False,
               is_require_grad=False):
        configuration = {
            'model_type': model_type,
            "model_name": model_name,
            "device": device,
            "mode": mode,
            "self_attention": self_attention,
            "is_freeze": is_require_grad
        }

        if 'camem' in model_type:
            config_bert = CamembertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name,
                                                   config=config_bert)
            model.to(device)
        elif 'flaubert' in model_type:
            config_bert = FlaubertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = FlaubertModel.from_pretrained(model_name,
                                                  config=config_bert)
            model.to(device)
        elif 'XLMRoberta' in model_type:
            config_bert = XLMRobertaConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = XLMRobertaModel.from_pretrained(model_name,
                                                    config=config_bert)
            model.to(device)
        elif 'M-Bert' in model_type:
            config_bert = BertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config_bert)
            model.to(device)
        else:
            raise ValueError(f"unknown model_type: {model_type}")

        lstm = BiLSTM.create(embedding_size=embedding_size,
                             hidden_dim=hidden_dim,
                             rnn_layers=rnn_layers,
                             dropout=lstm_dropout)

        attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                                  attn_dropout)
        model.train()
        self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
        # if is_freeze:
        self.freeze()

        return self
Example #17
# ####

# bc = BertConfig(output_hidden_states=True)
# bert = BertModel(bc)
# bert = bert.from_pretrained("bert-base-uncased")
# bert = bert.from_pretrained("bert-base-multilingual-cased")
# bert = bert.eval()
# bert = bert.to(device)
# bert_tok = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# ####
# FR #
# ####

cbc = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
bert = CamembertModel.from_pretrained("camembert-base", config=cbc)  # from_pretrained returns the loaded model
bert = bert.eval()
bert = bert.to(device)
bert_tok = CamembertTokenizer.from_pretrained("camembert-base")

app = Flask(__name__)

# create the embedder; a StackedEmbedding combining GloVe and forward/backward flair embeddings is another option

SIZE_EMBED = -1
adaptive_pool = nn.AdaptiveAvgPool1d(SIZE_EMBED) if SIZE_EMBED > 0 else None
embedder = ELMoEmbeddings("small")
# bert_model_or_path="distilbert-base-uncased",
#   pooling_operation="mean", use_scalar_mix=True)
Example #18
    def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = True, min_freq=1,
                 only_use_pretrain_bpe=False, truncate_embed=True):
        super().__init__()

        self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = CamembertModel.from_pretrained(model_dir_or_name)

        self.encoder.resize_token_embeddings(len(self.tokenizer))
        self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
        encoder_layer_number = len(self.encoder.encoder.layer)
        if isinstance(layers, list):
            self.layers = [int(l) for l in layers]
        elif isinstance(layers, str):
            self.layers = list(map(int, layers.split(',')))
        else:
            raise TypeError("`layers` only supports str or list[int]")
        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                       f"a bert model with {encoder_layer_number} layers."
            else:
                assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a bert model with {encoder_layer_number} layers."

        assert pool_method in ('avg', 'max', 'first', 'last')
        self.pool_method = pool_method
        self.include_cls_sep = include_cls_sep
        self.pooled_cls = pooled_cls
        self.auto_truncate = auto_truncate

        logger.info("Start to generate word pieces for word.")
        word_piece_dict = {'<s>': 1, '</s>': 1}
        found_count = 0
        new_add_to_bpe_vocab = 0
        unsegment_count = 0
        if "<s>" in vocab:
            warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
                          "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
                          " and end.")

        unique = []
        for word, index in vocab:

            word_pieces = self.tokenizer.tokenize(word)

            word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
            if 3 in word_token_ids:  # id 3 is <unk> in CamemBERT's vocabulary
                if word_pieces[word_token_ids.index(3)] not in unique:
                    unique.append(word_pieces[word_token_ids.index(3)])
                    unsegment_count += 1
            if not vocab._is_word_no_create_entry(word):
                if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
                    if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
                            word) and not only_use_pretrain_bpe:
                        word_piece_dict[word] = 1
                        new_add_to_bpe_vocab += 1
                    unsegment_count += 1
                    continue
            found_count += 1
            for word_piece in word_pieces:
                word_piece_dict[word_piece] = 1

        if unsegment_count > 0:
            logger.info(f"{unsegment_count} words are unsegmented.")

        word_to_wordpieces = []
        word_pieces_lengths = []
        for word, index in vocab:
            if index == vocab.padding_idx:
                word = '<pad>'
            elif index == vocab.unknown_idx:
                word = '<unk>'
            word_pieces = self.tokenizer.tokenize(word)
            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
            word_to_wordpieces.append(word_pieces)
            word_pieces_lengths.append(len(word_pieces))
        self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
        self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
        self._word_pad_index = vocab.padding_idx
        self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids(
            '<pad>')
        self.word_to_wordpieces = np.array(word_to_wordpieces)
        self.register_buffer('word_pieces_lengths',
                             torch.LongTensor(word_pieces_lengths))
        self.encoder.resize_token_embeddings(len(self.tokenizer))
        logger.debug("Successfully generate word pieces.")
Example #19
 def __init__(self, text_ptm_dir, text_ft_dim, num_classes):
     super(TextOnly, self).__init__()
     self.camembert = CamembertModel.from_pretrained(text_ptm_dir)
     self.classifier = nn.Linear(text_ft_dim, num_classes, bias=False)
Example #20
def main():
    if args.input_file:
        input_corpus = readfile(args.input_file)
    else:
        print("Could not find the input file!")
        exit()

    if args.output_file:
        output_file = open(args.output_file, "w")
    else:
        print("Could not find the output file!")
        exit()

    tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-base-ccnet')
    # load model
    model = CamembertModel.from_pretrained('camembert/camembert-base-ccnet')

    # read_data

    for line in input_corpus:
        line = line.rstrip()
        sent = line.split(' ')
        # encode() automatically adds the classification token <s>
        token_ids = tokenizer.encode(line)
        tokens = [tokenizer._convert_id_to_token(idx) for idx in token_ids]
        # unsqueeze token_ids because batch_size=1
        token_ids = torch.tensor(token_ids).unsqueeze(0)
        # the forward method returns a tuple; we only want the last hidden layer
        # squeeze() because batch_size=1
        output = model(token_ids)[0].squeeze()
        # output[0] is the output of the <s> (CLS) token, the first token,
        # and can be used as a sentence embedding
        # map each word to its embedding, taking subword splits into account
        embeddings = [[] for _ in sent]
        words = len(sent) * [""]
        l = -1
        for i in range(1, len(output) - 1):
            if re.match("^▁", tokens[i]):
                # a new word starts; write the previous one to file
                if l != -1:
                    if len(embeddings[l]) > 1:
                        em = numpy.array(embeddings[l])
                        word_vec = list(numpy.sum(em, axis=0))
                    else:
                        word_vec = embeddings[l][0]
                    output_file.write(words[l] + " " + " ".join([str(x) for x in word_vec]) + "\n")

                l = l + 1
                print(i, l, tokens[i])
                words[l] = re.sub("▁", "", tokens[i])
                embeddings[l] = [output[i].detach().numpy()]
            else:
                print(i, l, tokens[i])
                words[l] = words[l] + tokens[i]
                print(words[l])
                embeddings[l].append(output[i].detach().numpy())

        # flush the last word of the line
        if len(embeddings[l]) > 1:
            em = numpy.array(embeddings[l])
            word_vec = list(numpy.sum(em, axis=0))
        else:
            word_vec = embeddings[l][0]
        output_file.write(words[l] + " " + " ".join([str(x) for x in word_vec]) + "\n")

        output_file.write("\n")
Example #21
    pred = classifier.predict(cls_embedding)
    top_emoji = emoji_list[np.argmax(pred)]
    top3_emojis = [emoji_list[k] for k in np.argpartition(pred[0], -3)[-3:].tolist()]
    top3_emojis = ' '.join(top3_emojis)
    top_proba = max(pred[0])
    top3_proba = [pred[0][k] for k in np.argpartition(pred[0], -3)[-3:].tolist()]
    print('\nPrediction successfully run.\n')

    return render_template('index.html', top1=top_emoji, 
                                         top3=top3_emojis,
                                         top1proba=top_proba,
                                         top3proba=top3_proba)


if __name__ == "__main__":
    
    # load model
    print('Loading model...')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    embedder = CamembertModel.from_pretrained('camembert-base')
    classifier = keras.models.load_model("../models/100k_sentences_averaged_embedding.h5")
    print('\nModel successfully loaded!\n')

    # load emoji list
    with open('top100.txt') as f:
        emoji_list = ast.literal_eval(f.read())
    
    app.run(host="0.0.0.0", port=5000, debug=True)
    

Example #22
# We will test two different embeddings: CamemBERT and Word2Vec.
# Others are of course possible.

# We train a tokenizer:
ME = Make_Extractive(cpu=cpu)
tok = ME.make_tokenizer(Art, 12000, 'MLSUM_tokenizer', name='MLSUM_tokenizer')

# We already have a trained tokenizer, so we reuse it:
# we feed our SentencePiece model into the Camembert tokenizer
#%%
from transformers import CamembertTokenizer
tok = CamembertTokenizer('MLSUM_tokenizer.model')
# %%
# Load the BERT model
from transformers import CamembertModel, BertModel, RobertaModel
camem = CamembertModel.from_pretrained("camembert-base")
camem.config.hidden_size
#%%
camem.eval()
encod = tok(articles[0][0:1])
input_id = torch.tensor(encod['input_ids'])
att_mask = torch.tensor(encod['attention_mask'])
# %%
embedding = camem(input_id, att_mask)


# %%
class Make_Embedding():
    def __init__(self, tok=None, cpu=None) -> None:
        super(Make_Embedding, self).__init__()
        self.tokenizer = tok
Example #23
# This env variable will disable SSL verification
# for requests (because of Milliman SSL)
# https://stackoverflow.com/questions/48391750/disable-python-requests-ssl-validation-for-an-imported-module
os.environ["CURL_CA_BUNDLE"] = ""
# disable warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")

DATA_PATH = os.path.join(BASE_PATH, "input/tweets.csv")
MODEL_PATH = os.path.join(BASE_PATH, "output/camembert")  # MODEL SAVE PATH

# Set Pytorch device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHS = 10
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
WEIGHT_DECAY = 0.01
TOKENIZER = CamembertTokenizer.from_pretrained(
    "camembert-base",
    do_lower_case=True,
    verify=False,
)
CAMEMBERT_MODEL = CamembertModel.from_pretrained("camembert-base")

# ? valid batch size differs from train batch size
# ? BCEWithLogits rather than BCE
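As a quick check that the constants above fit together, a hedged encoding sketch (the example tweets are placeholders):

CAMEMBERT_MODEL.to(DEVICE)
batch = TOKENIZER(
    ["Premier tweet", "Deuxième tweet"],
    max_length=MAX_LEN,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
).to(DEVICE)
with torch.no_grad():
    hidden = CAMEMBERT_MODEL(**batch)[0]  # (2, MAX_LEN, 768)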
Example #24
        b, _ = self.make_embedding(article, self.camem)
        b = torch.stack(b)
        VSA = b.mean(dim=0)
        score = self.cosim(VSA, b)
        return score

    def make_summary(self, article, k=3):
        score = self.make_score(article)
        score = score.topk(k=k)[1]
        resume = [article[i] for i in score]
        return resume


#%%
from transformers import CamembertModel, CamembertConfig
camem = CamembertModel(CamembertConfig())  #.from_pretrained("camembert-base")
#%%
i = 1
TR = TextRank()
b, d = TR.make_embedding_bert(Art[i], camem)
b
#%%
mb = TR.mat_sim(b)
sb = TR.scores(mb)
sb = [s[0] for s in sb]
#%%
w = TR.make_embedding_W2V(Art[0], W2V)
print(len(w), w[0].shape)
mw = TR.mat_sim(w)
sw = TR.scores(mw)
sw = [s[0] for s in sw]
Example #25
def test_camembert_embeddings():
    camembert_model: str = "camembert-base"

    tokenizer = CamembertTokenizer.from_pretrained(camembert_model)
    model = CamembertModel.from_pretrained(
        pretrained_model_name_or_path=camembert_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "J'aime le camembert !"

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #   0       1          2         3         4      5       6       7    8      9
    #
    # '<s>',   '▁J',      "'",     'aime',   '▁le', '▁ca', 'member', 't', '▁!', '</s>'
    #           \          |         /         |      \       |        /   |
    #                    J'aime                le         camembert        !
    #
    #                      0                   1              2            3

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = CamembertEmbeddings(
            pretrained_model_name_or_path=camembert_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    camembert_first_subword_embedding_ref = first_layer[5].tolist()
    camembert_first_subword_embedding_actual = sentence_first_subword.tokens[
        2
    ].embedding.tolist()

    assert (
        camembert_first_subword_embedding_ref
        == camembert_first_subword_embedding_actual
    )

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    camembert_last_subword_embedding_ref = first_layer[7].tolist()
    camembert_last_subword_embedding_actual = sentence_last_subword.tokens[
        2
    ].embedding.tolist()

    assert (
        camembert_last_subword_embedding_ref == camembert_last_subword_embedding_actual
    )

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last"
    )

    camembert_first_last_subword_embedding_ref = torch.cat(
        [first_layer[5], first_layer[7]]
    ).tolist()
    camembert_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        2
    ].embedding.tolist()

    assert (
        camembert_first_last_subword_embedding_ref
        == camembert_first_last_subword_embedding_actual
    )

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    camembert_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[5], first_layer[6], first_layer[7]]
    ).tolist()
    camembert_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        2
    ].embedding.tolist()

    assert (
        camembert_mean_subword_embedding_ref == camembert_mean_subword_embedding_actual
    )

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Paris", pooling_operation="first", layers="1,2,3,4"
    )

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="TGW",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #26
def extractHiddenState():
    transformer = CamembertModel.from_pretrained("camembert-base")
    return ExtractHiddenState(transformer)
Example #27
 def __init__(self, pretrained_model_name='camembert-base'):
     super(CamembertClassifier, self).__init__()
     self.encoder = CamembertModel.from_pretrained(pretrained_model_name,output_attentions=True)
     self.cls_layer = nn.Linear(self.encoder.pooler.dense.out_features, 5)