示例#1
0
文件: core.py 项目: MsLimon/sister
    def __init__(
            self,
            lang: str = 'en',
            ):
        """Load a pretrained tokenizer/model pair for ``lang``.

        Supported languages: 'en' (ALBERT), 'fr' (CamemBERT),
        'es' (Spanish BERT), 'ja' (Japanese BERT).

        Raises:
            ImportError: if the optional `transformers` dependency is missing.
            ValueError: if ``lang`` is not one of the supported languages
                (previously this fell through and crashed later with
                UnboundLocalError on ``tokenizer``).
        """
        try:
            from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer, AutoTokenizer
            # BUG FIX: BertModel is used in the "ja" branch below but was
            # never imported, so lang="ja" raised NameError.
            from transformers import AlbertModel, CamembertModel, AutoModel, BertModel
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
            model = AlbertModel.from_pretrained("albert-base-v2")
        elif lang == "fr":
            tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
            model = CamembertModel.from_pretrained("camembert-base")
        elif lang == "es":
            tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
            model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
        elif lang == "ja":
            tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
            model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
        else:
            raise ValueError("unsupported lang: {!r}".format(lang))

        self.tokenizer = tokenizer
        self.model = model
示例#2
0
 def load_tokenizer(self):
     """Instantiate the HuggingFace tokenizer matching ``self.model_name``.

     The tokenizer class is selected by model family and loaded from the
     ``self.model_type`` checkpoint.  Every family except GPT-2 is created
     with ``do_lower_case=True``.  Prints a progress message when
     ``self.verbose`` is truthy.  Unknown names leave ``self.tokenizer``
     untouched (unchanged from the original behavior).
     """
     if self.verbose:  # idiom fix: was `self.verbose == True`
         print('Loading {} tokenizer...'.format(self.model_name))
     # The families are mutually exclusive, so use an elif chain instead of
     # re-testing every independent `if` after a match.
     if self.model_name == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'distilbert':
         self.tokenizer = DistilBertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'albert':
         self.tokenizer = AlbertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'bart':
         self.tokenizer = BartTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'xlnet':
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_type,
                                                         do_lower_case=True)
     elif self.model_name == 'roberta':
         self.tokenizer = RobertaTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'camenbert':  # (sic) key kept for compatibility
         self.tokenizer = CamembertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'flaubert':
         self.tokenizer = FlaubertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'gpt2':
         self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
def run_generation(text):
    """Fill the single ``<mask>`` token in *text* with CamemBERT.

    Returns a list of (token, probability) pairs for the five most likely
    completions.  Assumes *text* contains exactly one mask token
    (``.item()`` raises otherwise).
    """
    tok = CamembertTokenizer.from_pretrained("camembert-base")

    ids = tok.encode(text, add_special_tokens=True)
    print('\nTokenized text: %s (%s)' % (ids, tok.decode(ids)))

    # Shape (1, seq_len): a single-example batch for the model.
    batch = torch.tensor(ids).unsqueeze(0)

    mlm = CamembertForMaskedLM.from_pretrained("camembert-base", resume_download=True)
    mlm.eval()

    with torch.no_grad():
        out = mlm(batch)

        mask_pos = (batch.squeeze() == tok.mask_token_id).nonzero().item()
        print('Masked index: %d (%s)\n' % (mask_pos, tok.mask_token))

        # Distribution over the vocabulary at the masked position.
        scores = out[0][0, mask_pos, :].softmax(dim=0)
        top_p, top_i = scores.topk(k=5, dim=0)

        tokens = [tok.decode([t]) for t in top_i]
        probs = [round(p.item(), 2) for p in top_p]
        return list(zip(tokens, probs))
示例#4
0
 def __init__(self):
     """Build the TextRank summarizer: a CamemBERT embedding helper and a
     freshly-configured CamemBERT encoder."""
     # BUG FIX: the original wrote `super(TextRank, self).__init__` without
     # parentheses, so the parent initializer was never actually called.
     super(TextRank, self).__init__()
     # Tokenizer is loaded from a local sentencepiece model file;
     # embedding work is spread across all available CPU cores.
     self.bert_embedding = Make_Embedding(tok=CamembertTokenizer(
         'C:/Users/theo.roudil-valentin/Documents/Resume/MLSUM/MLSUM_tokenizer.model'
     ),
                                          cpu=psutil.cpu_count())
     # Randomly initialized CamemBERT (default config, no pretrained weights).
     self.camem = CamembertModel(CamembertConfig())
示例#5
0
 def __init__(self, auto_model: str, auto_path: str):
     """Load a frozen pretrained transformer for feature extraction.

     auto_model: family name used for dispatch via substring match
         (checked in order: camembert, flaubert, xlm, bert, else Auto*).
     auto_path: checkpoint path/name passed to ``from_pretrained``.

     Imports are done lazily inside each branch so only the needed
     transformers classes are pulled in.
     """
     super().__init__()
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         # FlauBERT checkpoints here are loaded through the XLM classes;
         # presumably a legacy of older transformers releases — verify.
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         # Keep case/accents: FlauBERT is cased.
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         # Fallback: let transformers pick the classes from the config.
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     # Freeze the embedding model: it is used as a fixed feature extractor.
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
def get_features(input_text, extractor):
    """Return a (1, dim) feature array for *input_text*.

    extractor == 'wangchanberta' uses a Thai CamemBERT-style checkpoint and
    sums the last four hidden layers; any other value falls back to a
    multilingual SentenceTransformer encoding.
    """
    # REMINDER: uncomment dependencies in requirements.txt needed for the feature extractor

    if extractor == 'wangchanberta':
        # import transformers
        from tqdm.auto import tqdm
        from transformers import CamembertTokenizer, pipeline

        # create tokenizer & feature extractor
        tokenizer = CamembertTokenizer.from_pretrained(
            'airesearch/wangchanberta-base-att-spm-uncased', revision='main')
        tokenizer.additional_special_tokens = [
            '<s>NOTUSED', '</s>NOTUSED', '<_>'
        ]

        feature_extractor = pipeline(
            task='feature-extraction',
            tokenizer=tokenizer,
            model=f'airesearch/wangchanberta-base-att-spm-uncased',
            revision='main')

        # get features from last 4 states; truncate so the sequence fits the
        # model's maximum input length
        input_text = input_text[:415]
        last_k = 4
        hidden_states = feature_extractor(input_text)[0]
        last_k_layers = [
            hidden_states[i] for i in [-i for i in range(1, last_k + 1)]
        ]
        cat_hidden_states = np.array(sum(last_k_layers, []))
        return cat_hidden_states[None, :]

    else:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
        # BUG FIX: the original encoded the *builtin* `input` function
        # instead of the `input_text` argument.
        return model.encode(input_text)
示例#7
0
文件: core.py 项目: tofunlp/sister
    def __init__(
        self,
        lang: str = "en",
    ):
        """Load a tokenizer/model pair with hidden states enabled.

        Supported languages: 'en' (ALBERT), 'fr' (CamemBERT),
        'ja' (Japanese BERT).

        Raises:
            ImportError: if the optional `transformers` dependency is missing.
            ValueError: if ``lang`` is not supported (previously this fell
                through and crashed with UnboundLocalError on ``tokenizer``).
        """
        try:
            from transformers import (AlbertModel, AlbertTokenizer, BertConfig,
                                      BertJapaneseTokenizer, BertModel,
                                      CamembertModel, CamembertTokenizer)
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            model_name = "albert-base-v2"
            tokenizer = AlbertTokenizer.from_pretrained(model_name)
            # output_hidden_states=True so callers can pool arbitrary layers.
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = AlbertModel.from_pretrained(model_name, config=config)
        elif lang == "fr":
            model_name = "camembert-base"
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name, config=config)
        elif lang == "ja":
            model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
            tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config)
        else:
            raise ValueError("unsupported lang: {!r}".format(lang))

        self.tokenizer = tokenizer
        self.model = model
示例#8
0
def predict_camembert(df: pd.DataFrame) -> pd.DataFrame:
    """
    predict the sentiment of reviews

    Concatenates 'titre' + ' ' + 'comment' into a 'comments' column, runs a
    fine-tuned binary CamemBERT classifier over the batch, and returns a new
    dataframe with one sentiment label (0/1) per row.

    :param df: dataframe with reviews; must contain 'titre', 'comment',
        'site', 'date' and 'review' columns — TODO confirm 'review' exists
        in callers, it is not built here
    :return: dataframe: dataframe with prediction of reviews
    """
    # Join title and comment with a space; fillna('') keeps NaN cells from
    # poisoning the string sum.
    df['space'] = ' '
    df['comments'] = df[['titre', 'space', 'comment']].fillna('').sum(axis=1)
    df = df.dropna(subset=['comments'], axis="rows")
    comments = df['comments'].to_list()
    # camemBERT: load fine-tuned weights (CPU) into a 2-label classifier.
    state_dict = torch.load("camemBERT_38000_state_dict.pt",
                            map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)

    # Initialize CamemBERT tokenizer
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)

    # Encode the comments
    tokenized_comments_ids = [
        tokenizer.encode(comment, add_special_tokens=True, max_length=MAX_LEN)
        for comment in comments
    ]
    # Pad the resulted encoded comments
    tokenized_comments_ids = pad_sequences(tokenized_comments_ids,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")

    # Create attention masks: 1.0 for real tokens (id > 0), 0.0 for padding.
    attention_masks = []
    for seq in tokenized_comments_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    prediction_inputs = torch.tensor(tokenized_comments_ids)
    prediction_masks = torch.tensor(attention_masks)

    predictions = []
    with torch.no_grad():
        # Forward pass, calculate logit predictions; the whole dataset is
        # fed as a single batch — NOTE(review): may exhaust memory on large
        # inputs, consider batching.
        outputs = model(prediction_inputs.to(device),
                        token_type_ids=None,
                        attention_mask=prediction_masks.to(device))
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        # argmax over the 2 labels -> 0/1 sentiment per comment.
        predictions.extend(np.argmax(logits, axis=1).flatten())

    df = pd.DataFrame(
        data={
            "site": df["site"],
            "date": df["date"],
            "review": df["review"],
            "sentiment": predictions
        })
    return df
示例#9
0
    def __init__(
        self,
        pretrained_embedding=None,
        architecture_function=None,
        text_input_column="clean_text",
        meta_input_list=("extension", "dayofweek", "hour", "min"),
        vocab_size=25000,
        seq_size=100,
        embedding_dim=200,
        loss="categorical_crossentropy",
        activation="softmax",
        batch_size=4096,
        n_epochs=15,
        bert_tokenizer="jplu/tf-camembert-base",
        bert_model="jplu/tf-camembert-base",
        **kwargs,
    ):
        """Configure the model wrapper and select a tokenizer.

        Non-BERT architectures use the project's own ``Tokenizer``; BERT
        architectures load a HuggingFace tokenizer matching
        ``bert_tokenizer`` (CamemBERT or FlauBERT families only).

        Raises:
            ImportError: when `transformers` is not installed.
            NotImplementedError: for unsupported ``bert_tokenizer`` values.
        """
        self.architecture_function = architecture_function
        self.pretrained_embedding = pretrained_embedding
        if self.architecture_function.__name__ != "bert_model":
            self.tokenizer = Tokenizer(input_column=text_input_column)
        elif "camembert" in bert_tokenizer.lower():
            # Prevent the HuggingFace dependency
            try:
                from transformers import CamembertTokenizer

                self.tokenizer = CamembertTokenizer.from_pretrained(
                    bert_tokenizer)
            except ModuleNotFoundError:
                # BUG FIX: the original did `raise ("...")`, which raises a
                # plain str and fails with "exceptions must derive from
                # BaseException" instead of showing the install hint.
                raise ImportError(
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        elif "flaubert" in bert_tokenizer.lower():
            # Prevent the HuggingFace dependency
            try:
                from transformers import XLMTokenizer

                self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
            except ModuleNotFoundError:
                # Same fix as above: raise a real exception type.
                raise ImportError(
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        else:
            raise NotImplementedError(
                "Bert tokenizer {} not implemented".format(bert_tokenizer))
        self.text_input_column = text_input_column
        self.meta_input_list = meta_input_list
        self.vocab_size = vocab_size
        self.seq_size = seq_size
        self.embedding_dim = embedding_dim
        self.loss = loss
        self.activation = activation
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.bert_model = bert_model
        self.nb_labels = 0
        self.nb_meta_features = 0
        self.vocabulary = []
        self.vocabulary_dict = {}
 def load(self):
     """
     Load Camembert model from FAIR repo

     Fetches the tokenizer and encoder for MODEL_NAME (module-level
     constant) and switches the model to inference mode.
     :return: None — results are stored on self.tokenizer / self.model
     """
     self.tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
     self.model = CamembertModel.from_pretrained(MODEL_NAME)
     # Disable dropout/batch-norm updates: this instance only does inference.
     self.model.eval()
示例#11
0
def get_roberta_tokenizer(pretrained_cfg_name: str,
                          do_lower_case: bool = True):
    """Return a RoBERTa-family tokenizer for the given checkpoint name.

    CamemBERT checkpoints (detected by substring) get their dedicated
    tokenizer class; everything else uses the plain RoBERTa tokenizer.
    """
    # still uses HF code for tokenizer since they are the same
    tokenizer_cls = (CamembertTokenizer
                     if "camembert" in pretrained_cfg_name
                     else RobertaTokenizer)
    return tokenizer_cls.from_pretrained(pretrained_cfg_name,
                                         do_lower_case=do_lower_case)
示例#12
0
def encode_tweets(tweets, max_length=MAX_LENGTH):
    """Encode each tweet into a fixed-width int32 matrix.

    Returns an array of shape (len(tweets), max_length); shorter encodings
    are right-padded with zeros, longer ones are truncated.
    """
    tokenizer = Tokenizer.from_pretrained("camembert-base")
    encoded = np.zeros(shape=(len(tweets), max_length), dtype=np.int32)

    for i, tweet in enumerate(tweets):
        # BUG FIX: encodings longer than max_length previously crashed the
        # slice assignment with a shape mismatch; truncate them instead.
        tweet_encoded = tokenizer.encode(tweet)[:max_length]
        encoded[i][:len(tweet_encoded)] = tweet_encoded

    return encoded
    def __init__(self, language):
        """Pick the tokenizer class for *language* and load its checkpoint.

        French uses CamemBERT's dedicated tokenizer; every other language
        falls back to the plain BERT tokenizer.  The checkpoint name comes
        from the module-level ``Transformers`` mapping.
        """
        self.language = language

        tokenizer_cls = (CamembertTokenizer
                         if self.language == "French"
                         else BertTokenizer)
        self.tokenizer = tokenizer_cls.from_pretrained(
            Transformers[self.language])
def main():
    """Train a BPE tokenizer on *.txt files and sanity-check it.

    CLI flags select the BPE flavor (byte_level / char / sentencepiece),
    vocabulary size, frequency cutoff, input/output directories and file
    extension.  The trained model is saved to --output_dir and then
    reloaded through CamembertTokenizer for a smoke test.
    """
    #argparser
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )

    #required
    parser.add_argument("--bpe_tokenizer",
                        type=str,
                        default='sentencepiece',
                        help='Specify the name of BPE Tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument(
        "--train_dir",
        type=str,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
    )
    parser.add_argument("--ext", type=str, default='.txt')

    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # Initialize a tokenizer.
    # BUG FIX: the choices are mutually exclusive, and an unknown value
    # previously fell through to a NameError on _BPE_TOKENIZER; fail with a
    # clear message instead.
    if args.bpe_tokenizer == 'byte_level':
        tokenizer = ByteLevelBPETokenizer()
    elif args.bpe_tokenizer == 'char':
        tokenizer = CharBPETokenizer()
    elif args.bpe_tokenizer == 'sentencepiece':
        tokenizer = SentencePieceBPETokenizer()
    else:
        raise ValueError(
            "unknown --bpe_tokenizer: {!r}".format(args.bpe_tokenizer))

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model(args.output_dir)

    #test
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))
示例#15
0
    def __init__(self, sentences, NUM_CLASS, seed_val=42, random_state=2018, evaluate_score=flat_accuracy):
      """Store training settings and probe the tokenized sentence lengths.

      sentences: iterable of raw strings used to estimate max_length.
      NUM_CLASS: number of target classes.
      """
      self.seed_val = seed_val
      self.random_state = random_state
      self.evaluate_score = evaluate_score
      self.NUM_CLASS = NUM_CLASS

      print('Loading CamemBERT tokenizer...') # Load the CamemBERT tokenizer.
      # BUG FIX: the original loaded CamembertTokenizer from
      # 'bert-base-uncased', a WordPiece checkpoint with no sentencepiece
      # model — inconsistent with the message above and bound to fail.
      tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

      print('Set max_length as: ', min(512, np.max(np.array([len(tokenizer.encode(i, add_special_tokens=True)) for i in sentences]))) )
示例#16
0
 def __init__(self, DIR, filename):
     """Load a fine-tuned CamemBERT sequence classifier from disk.

     DIR/filename points at a saved state_dict; tokenizer and base model
     names come from the module-level ``config`` mapping.  The classifier
     is put in eval mode and moved to GPU when one is available.
     """
     self.path = os.path.join(DIR, filename)
     use_gpu = torch.cuda.is_available()
     self.device = torch.device("cuda:0" if use_gpu else "cpu")
     self.tokenizer = CamembertTokenizer.from_pretrained(
         config["BERT_MODEL"])
     model = CamembertForSequenceClassification.from_pretrained(
         config['BERT_MODEL'], num_labels=len(config["CLASS_NAMES"]))
     # map_location keeps CPU-only hosts working with GPU-saved weights.
     model.load_state_dict(torch.load(self.path, map_location=self.device))
     self.classifier = model.eval().to(self.device)
示例#17
0
 def __init__(self):
     """Set up a WangchanBERTa fill-mask pipeline.

     Loads the tokenizer for the fixed Thai checkpoint, overrides its
     additional special tokens, and wires it into a HuggingFace
     'fill-mask' pipeline.  The mask token string is cached for callers.
     """
     self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
     self.target_tokenizer = CamembertTokenizer
     tok = CamembertTokenizer.from_pretrained(self.model_name,
                                              revision='main')
     tok.additional_special_tokens = [
         '<s>NOTUSED', '</s>NOTUSED', '<_>'
     ]
     self.tokenizer = tok
     self.fill_mask = pipeline(task='fill-mask',
                               tokenizer=tok,
                               model=self.model_name,
                               revision='main')
     self.MASK_TOKEN = tok.mask_token
示例#18
0
    def __init__(
            self,
            arguments_service: PretrainedArgumentsService):
        """Initialize the tokenizer service from pretrained weights.

        arguments_service supplies the checkpoint name via its
        ``pretrained_weights`` attribute.
        """
        super().__init__()

        self._arguments_service = arguments_service

        # CLEANUP: the original also read `arguments_service.configuration`
        # into a local that was never used; removed.
        self._tokenizer: CamembertTokenizer = CamembertTokenizer.from_pretrained(
            arguments_service.pretrained_weights)
        # Punctuation tokens that downstream code treats specially.
        self._sign_tokens = [',', '.', ';']
        # SentencePiece marks word-initial subwords with '▁'.
        self._subword_prefix_symbol = '▁'
示例#19
0
    def __init__(self, model_dir_or_name: str, layers: str = '-1',
                 pooled_cls: bool = False):
        """Load a CamemBERT tokenizer and token-classification encoder.

        model_dir_or_name: checkpoint directory or hub name.
        layers: accepted but not used in this constructor — presumably
            consumed elsewhere; verify against the full class.
        pooled_cls: stored flag controlling CLS pooling downstream.

        NOTE(review): the attribute name ``tokenzier`` is a typo kept
        as-is because external code may already reference it.
        """
        super().__init__()

        self.tokenzier = CamembertTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = CamembertForTokenClassification.from_pretrained(
            model_dir_or_name)

        # Cache special-token ids from the tokenizer's `encoder` mapping —
        # NOTE(review): assumes the installed transformers version exposes
        # `.encoder` on CamembertTokenizer; confirm for newer releases.
        self._cls_index = self.tokenzier.encoder['<s>']
        self._sep_index = self.tokenzier.encoder['</s>']
        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
        self.pooled_cls = pooled_cls
示例#20
0
def get_train_valid_test_fine(bert_model, max_seq_lenght):
    """Build train/valid/test dataloaders for the fine-grained dataset.

    bert_model: checkpoint name for the CamemBERT tokenizer.
    max_seq_lenght: maximum sequence length passed to the dataloaders.
    Returns (train_dl, valid_dl, test_dl); validation is a 20% split of
    the training CSV.
    """
    tokenizer = CamembertTokenizer.from_pretrained(bert_model)

    full_train = pd.read_csv("../data/fine_train.csv")
    train_part, valid_part = train_test_split(full_train, test_size=0.2)
    test_part = pd.read_csv("../data/fine_test.csv")

    return (
        _get_dataloader(train_part, tokenizer, max_seq_lenght),
        _get_dataloader(valid_part, tokenizer, max_seq_lenght,
                        drop_last=False),
        _get_dataloader(test_part, tokenizer, max_seq_lenght,
                        drop_last=False),
    )
示例#21
0
def evaluate():
    """Evaluate the saved classifier on the test set.

    Prints overall accuracy and the F-0.5 score computed from the
    confusion-matrix counts.  Sentences are tokenized, hard-truncated to
    512 tokens, and scored batch by batch.
    """
    model = load_model(save_model_location)
    tokenizer = CamembertTokenizer.from_pretrained(tokenizer_location)
    testloader = load_data()
    softmax = torch.nn.Softmax(dim=1)
    iter_loader = iter(testloader)
    correct = 0
    tp, tn, fp, fn = 0, 0, 0, 0  # true-positive, true-negative, false-positive, false-negatives
    print("Starting Evaluation")
    total = 0
    for data in tqdm(iter_loader):

        data["sentence"] = tokenizer(data["sentence"],
                                     padding=True,
                                     max_length=512)
        # Hard-truncate ids/masks to 512: `max_length` alone does not
        # truncate without `truncation=True`.
        data["sentence"]["input_ids"] = list(
            map(lambda x: x[:512], data["sentence"]["input_ids"]))
        data["sentence"]["attention_mask"] = list(
            map(lambda x: x[:512], data["sentence"]["attention_mask"]))
        data["sentence"]["input_ids"] = torch.tensor(
            data["sentence"]["input_ids"], dtype=torch.long, device=cuda0)
        data["sentence"]["attention_mask"] = torch.tensor(
            data["sentence"]["attention_mask"], device=cuda0)

        output = model(data["sentence"]["input_ids"],
                       data["sentence"]["attention_mask"])

        # BUG FIX: softmax was applied *inside* the per-sample loop below,
        # so every sample after the first saw a repeatedly re-softmaxed
        # tensor.  Normalize once per batch instead.
        output = softmax(output)

        # For all data in 1 batch (Here 2 datasets are present in a single batch)
        for i in range(len(data["label"])):
            total += 1
            actual = data["label"][i].item()
            pred = torch.argmax(output[i]).item()
            if pred == actual:
                correct += 1
            if actual:  # if 1
                if pred:
                    tp += 1
                else:
                    fn += 1
            else:
                if pred:
                    fp += 1
                else:
                    tn += 1

    print("Percentage of correct predictions: {}".format(
        (correct / total * 100.0)))
    print("F-0.5 value is {}".format(calculate_f_beta(tp, fp, tn, fn)))
示例#22
0
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Optional[Dict] = None, tokenizer_args: Optional[Dict] = None):
        """Load a CamemBERT encoder + tokenizer for sentence embedding.

        model_name_or_path: checkpoint name or directory.
        max_seq_length: capped at 511 (514 with special tokens).
        do_lower_case: forwarded to the tokenizer when not None.
        model_args / tokenizer_args: extra kwargs for ``from_pretrained``.
        """
        super(CamemBERT, self).__init__()
        # BUG FIX: the dict defaults were mutable ({}), and tokenizer_args
        # is mutated below — a classic shared-default bug.  Use None
        # sentinels and build fresh dicts per call.
        model_args = {} if model_args is None else model_args
        tokenizer_args = {} if tokenizer_args is None else tokenizer_args
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 511:
            logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511")
            max_seq_length = 511
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.camembert = CamembertModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
    def __init__(self,
                 model_name: str,
                 num_labels: int = None,
                 max_length: int = None,
                 device: str = None,
                 tokenizer_args: Dict = None):
        """
        A CrossEncoder takes exactly two sentences / texts as input and either predicts
        a score or label for this sentence pair. It can for example predict the similarity of the sentence pair
        on a scale of 0 ... 1.

        It does not yield a sentence embedding and does not work for individually sentences.

        :param model_name: Any model name from Huggingface Models Repository that can be loaded with AutoModel. We provide several pre-trained CrossEncoder models that can be used for common tasks
        :param num_labels: Number of labels of the classifier. If 1, the CrossEncoder is a regression model that outputs a continous score 0...1. If > 1, it output several scores that can be soft-maxed to get probability scores for the different classes.
        :param max_length: Max length for input sequences. Longer sequences will be truncated. If None, max length of the model will be used
        :param device: Device that should be used for the model. If None, it will use CUDA if available.
        :param tokenizer_args: Arguments passed to the tokenizer (None means no extra arguments)
        """
        # BUG FIX: tokenizer_args previously defaulted to a mutable {} shared
        # across calls; use a None sentinel instead.
        if tokenizer_args is None:
            tokenizer_args = {}

        self.config = AutoConfig.from_pretrained(model_name)
        # A checkpoint whose architectures all end in
        # 'ForSequenceClassification' already carries a trained head.
        classifier_trained = True
        if self.config.architectures is not None:
            classifier_trained = any([
                arch.endswith('ForSequenceClassification')
                for arch in self.config.architectures
            ])

        # Untrained head with no explicit label count -> regression (1 label).
        if num_labels is None and not classifier_trained:
            num_labels = 1

        if num_labels is not None:
            self.config.num_labels = num_labels

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, config=self.config)
        self.tokenizer = CamembertTokenizer.from_pretrained(
            model_name, **tokenizer_args)

        self.max_length = max_length

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Use pytorch device: {}".format(device))

        self._target_device = torch.device(device)
示例#24
0
    def init_nlp(self, model_path="model_nlp.pt"):
        """Load the spaCy French pipeline and the CamemBERT classifier.

        model_path: path to a fine-tuned state_dict for the 2-label
        CamemBERT sequence classifier.
        Returns (model, tokenizer).
        """
        try:
            nlp = spacy.load("fr_core_news_sm")
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt.
        except Exception:
            # Best-effort download; NOTE(review): the pipeline is not
            # reloaded afterwards and `nlp` is unused below — confirm intent.
            os.system("python -m spacy download fr")
            os.system("python -m spacy download fr_core_news_md")

        # load model camembert
        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
        #print("Loading trained model...")
        model = CamembertForSequenceClassification.from_pretrained(
            'camembert-base', num_labels=2, state_dict=state_dict)
        #print("Trained model loaded!")

        # load TOKENIZER camembert
        TOKENIZER = CamembertTokenizer.from_pretrained('camembert-base',
                                                       do_lower_case=True)
        return model, TOKENIZER
示例#25
0
    def create(cls,
               data_file,
               image_dir,
               transform,
               labels_path,
               pad_idx=0,
               tokenizer=None,
               model_type=None,
               min_char_len=1,
               max_seq_length=510,
               model_name="camembert-base",
               clear_cache=False,
               is_cls=True):
        """Alternate constructor: build a dataset from a pickled data file.

        When no tokenizer is given, one is selected from ``model_type``
        (camem / flaubert / XLMRoberta / M-Bert) and loaded from
        ``model_name``.

        Raises:
            ValueError: if ``model_type`` matches no known family (the
                original silently left ``tokenizer`` as None).
        """
        if tokenizer is None:
            if 'camem' in model_type:
                tokenizer = CamembertTokenizer.from_pretrained(model_name)
            elif 'flaubert' in model_type:
                tokenizer = FlaubertTokenizer.from_pretrained(model_name)
            elif 'XLMRoberta' in model_type:
                tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
            elif 'M-Bert' in model_type:
                tokenizer = BertTokenizer.from_pretrained(model_name)
            else:
                raise ValueError(
                    "unknown model_type: {!r}".format(model_type))

        with open(data_file, 'rb') as f:
            data = pickle.load(f)

        idx2labels, labels2idx = cls.create_labels(labels_path)
        config = {
            "min_char_len": min_char_len,
            "model_name": model_name,
            "max_sequence_length": max_seq_length,
            "clear_cache": clear_cache,
            "pad_idx": pad_idx,
            "is_cls": is_cls,
            "idx2labels": idx2labels,
            "labels2idx": labels2idx
        }

        self = cls(data, image_dir, transform, tokenizer, config)

        return self
示例#26
0
 def __init__(
     self,
     pretrained_embedding=None,
     architecture_function=None,
     text_input_column="clean_text",
     meta_input_list=("extension", "dayofweek", "hour", "min"),
     vocab_size=25000,
     seq_size=100,
     embedding_dim=200,
     loss="categorical_crossentropy",
     activation="softmax",
     batch_size=4096,
     n_epochs=15,
     bert_tokenizer="jplu/tf-camembert-base",
     bert_model="jplu/tf-camembert-base",
     **kwargs,
 ):
     """Configure the model wrapper and select a tokenizer.

     BUG FIX: ``meta_input_list`` previously defaulted to a mutable list
     shared across all instances; switched to a tuple, matching the
     sibling implementation of this class elsewhere in the codebase.

     Raises:
         NotImplementedError: for unsupported ``bert_tokenizer`` values.
     """
     self.architecture_function = architecture_function
     self.pretrained_embedding = pretrained_embedding
     if self.architecture_function.__name__ != "bert_model":
         self.tokenizer = Tokenizer(input_column=text_input_column)
     elif "camembert" in bert_tokenizer.lower():
         self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
     elif "flaubert" in bert_tokenizer.lower():
         self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
     else:
         raise NotImplementedError(
             "Bert tokenizer {} not implemented".format(bert_tokenizer))
     self.text_input_column = text_input_column
     self.meta_input_list = meta_input_list
     self.vocab_size = vocab_size
     self.seq_size = seq_size
     self.embedding_dim = embedding_dim
     self.loss = loss
     self.activation = activation
     self.batch_size = batch_size
     self.n_epochs = n_epochs
     self.bert_model = bert_model
     self.nb_labels = 0
     self.nb_meta_features = 0
     self.vocabulary = []
     self.vocabulary_dict = {}
 def sentence_embeddings(self):
     """Compute mean-pooled CamemBERT embeddings for ``self.sentences``.

     Chooses between the regular and the fine-tuned checkpoint based on
     ``self.finetuned_bert``, processes sentences in batches of
     ``self.batch_size`` (on GPU when available), and returns the stacked
     embeddings as an array via ``self.torch_to_array``.
     """
     if self.finetuned_bert == False:
         tokenizer = CamembertTokenizer.from_pretrained(
             stg.REGULAR_CAMEMBERT)
         model = CamembertModel.from_pretrained(stg.REGULAR_CAMEMBERT)
     else:
         tokenizer = AutoTokenizer.from_pretrained(
             stg.FINED_TUNED_CAMEMBERT)
         model = CamembertModel.from_pretrained(stg.FINED_TUNED_CAMEMBERT)
     if torch.cuda.is_available() == True:
         print(
             '====== Cuda is Available, GPU will be used for this task ======'
         )
         torch.cuda.empty_cache()
         model.cuda()
         device = torch.device("cuda")
     embedding_all_text = []
     number_sentences = len(self.sentences)
     for i in tqdm(range(0, number_sentences, self.batch_size)):
         if ((i + self.batch_size) < number_sentences):
             # Full batch.
             batch = self.sentences[i:i + self.batch_size]
             encoded_input = self.get_batch_sentence_tokens(
                 batch, tokenizer)
         elif (i == number_sentences):
             # NOTE(review): dead branch — range() never yields
             # i == number_sentences, so this `pass` is unreachable.
             pass
         else:
             # Final, possibly smaller batch.
             batch = self.sentences[i:]
             encoded_input = self.get_batch_sentence_tokens(
                 batch, tokenizer)
         if torch.cuda.is_available() == True:
             # NOTE(review): return value of .to(device) is discarded —
             # relies on BatchEncoding.to moving tensors in place; verify
             # for the transformers version in use.
             encoded_input.to(device)
         with torch.no_grad():
             model_output = model(**encoded_input)
         sentence_embeddings_tensor = self.mean_pooling(
             model_output, encoded_input['attention_mask'])
         embedding_all_text.append(sentence_embeddings_tensor)
         if torch.cuda.is_available() == True:
             # Free GPU memory between batches.
             del encoded_input
             del sentence_embeddings_tensor
             torch.cuda.empty_cache()
     sentence_embeddings = self.torch_to_array(embedding_all_text)
     return sentence_embeddings
示例#28
0
    def load_model(self, model_path):
        """Load every model artifact from *model_path*.

        Restores the joblib sentence tokenizer, the CamemBERT word
        tokenizer, the sentence-embedding model, and finally the Keras
        sentence model — from an ``.h5`` file when present, otherwise from
        a SavedModel directory named ``sentence_model``.
        """
        self.sentence_tokenizer = joblib.load(
            os.path.join(model_path, "sentence_tokenizer.joblib"))
        self.word_tokenizer = CamembertTokenizer.from_pretrained(model_path)
        self.sentence_embedding_model = (
            TFCamembertForSentenceEmbedding.from_pretrained(model_path))

        sentence_model_dir = os.path.join(model_path, "sentence_model")
        h5_file = sentence_model_dir + ".h5"
        if os.path.exists(h5_file):
            # HDF5 format
            self.sentence_model = tf.keras.models.load_model(h5_file)
        else:
            # SavedModel (TF) format
            self.sentence_model = tf.keras.models.load_model(
                sentence_model_dir)
示例#29
0
    def make_tokenizer(self,
                       text,
                       voc_size,
                       prefix,
                       mtype='bpe',
                       name="tokenizer_input"):
        '''
        Train a sentencepiece model on *text* and wrap it in a
        CamembertTokenizer.

        @text : input documents (written to a txt file, one doc per line)
        @voc_size : desired vocabulary size
        @prefix : name to give the trained model
        @mtype : model type, e.g. 'bpe'
        @name : base name for the intermediate input file

        NOTE(review): this chdir's into self.path as a side effect and does
        not restore the previous working directory.
        '''
        os.chdir(self.path)
        path = self.make_input_tokenizer(text, self.path, name)

        # Derive the directory part of `path` with the platform's separator
        # so the trained model can be located next to the input file.
        if '/' in path:
            chemin = '/'.join([i for i in path.split('/')[:-1]])
            chemin = chemin + '/'
        elif '\\' in path:
            chemin = '\\'.join([i for i in path.split('\\')[:-1]])
            chemin = chemin + "\\"
        else:
            raise ValueError(
                'La fonction ne parvient pas à trouver le chemin pour enregistrer le tokenizer, vérifier le chemin fourni, la variable path'
            )

        import sentencepiece as spm
        # The input must be a .txt file
        FUES = spm.SentencePieceTrainer.train(
            input=path,  # path to the txt file, one document per line
            vocab_size=
            voc_size,  # vocab size; can be raised, but must not be too large relative to the documents' word count
            model_prefix=
            prefix,  # model name, French Unsupervised Extractive Summarizer
            model_type=mtype)

        from transformers import CamembertTokenizer
        tokenizer = CamembertTokenizer(chemin + prefix + '.model')
        return tokenizer
示例#30
0
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        """Load a CamemBERT encoder/tokenizer pair for sentence embedding.

        ``max_seq_length`` is capped at 511 (514 once special tokens are
        added); the CLS/SEP token ids are cached for fast access.
        """
        super(CamemBERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 511:
            logging.warning(
                "CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511"
            )
            max_seq_length = 511
        self.max_seq_length = max_seq_length

        self.camembert = CamembertModel.from_pretrained(model_name_or_path)
        self.tokenizer = CamembertTokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)

        # Cache both special-token ids with a single lookup.
        special_ids = self.tokenizer.convert_tokens_to_ids(
            [self.tokenizer.cls_token, self.tokenizer.sep_token])
        self.cls_token_id, self.sep_token_id = special_ids