Example #1
 def load_tokenizer(self):
     # Load the tokenizer.
     if self.verbose:
         print('Loading {} tokenizer...'.format(self.model_name))
     if self.model_name == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'distilbert':
         self.tokenizer = DistilBertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'albert':
         self.tokenizer = AlbertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'bart':
         self.tokenizer = BartTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
     elif self.model_name == 'xlnet':
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_type,
                                                         do_lower_case=True)
     elif self.model_name == 'roberta':
         self.tokenizer = RobertaTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'camenbert':
         self.tokenizer = CamembertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'flaubert':
         self.tokenizer = FlaubertTokenizer.from_pretrained(
             self.model_type, do_lower_case=True)
     elif self.model_name == 'gpt2':
         self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
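
A hedged stand-alone sketch of the FlauBERT branch of this dispatch; the checkpoint id and the sample sentence are assumptions for illustration, not taken from the original code.

from transformers import FlaubertTokenizer

# Equivalent of the 'flaubert' branch above, with a concrete checkpoint id.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
print(tokenizer.tokenize("Le chat dort sur le canapé."))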
Example #2
    def init_model(self, model='flaubert', device=None, log=False):
        # Choosing device for language model
        if device is None:
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        try:
            # Flaubert model
            if model == 'flaubert':
                model_name = 'flaubert/flaubert_large_cased'
                flaubert = FlaubertModel.from_pretrained(model_name)
                tokenizer = FlaubertTokenizer.from_pretrained(
                    model_name, do_lowercase=False)
                self.model = flaubert
                self.tokenizer = tokenizer
                self.model_name = model_name
            # Camembert model
            elif model == 'camembert':
                model_name = 'camembert'
                self.model = torch.hub.load('pytorch/fairseq', 'camembert')
                self.model_name = model_name
        except Exception as err:
            print(f'Error while loading the {model} model: {err}')
            return

        # Model Inference
        self.model.to(self.device)
        self.model.eval()

        # Log Info
        if log:
            self.init_log(self.model_name, self.device)
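
A hedged stand-alone sketch of how the model and tokenizer loaded by init_model might be used for a forward pass; the sample sentence is an assumption, the checkpoint id is the one used above.

import torch
from transformers import FlaubertModel, FlaubertTokenizer

model_name = 'flaubert/flaubert_large_cased'
flaubert = FlaubertModel.from_pretrained(model_name)
tokenizer = FlaubertTokenizer.from_pretrained(model_name, do_lowercase=False)
flaubert.eval()

# Encode one sentence and take the last hidden states, as the class would after init_model().
token_ids = torch.tensor([tokenizer.encode("Le chat dort sur le canapé.")])
with torch.no_grad():
    last_hidden = flaubert(token_ids)[0]  # shape: (1, seq_len, hidden_size)
print(last_hidden.shape)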
Example #3
    def create(cls,
               data_file,
               image_dir,
               transform,
               labels_path,
               pad_idx=0,
               tokenizer=None,
               model_type=None,
               min_char_len=1,
               max_seq_length=510,
               model_name="camembert-base",
               clear_cache=False,
               is_cls=True):
        if tokenizer is None:
            if 'camem' in model_type:
                tokenizer = CamembertTokenizer.from_pretrained(model_name)
            elif 'flaubert' in model_type:
                tokenizer = FlaubertTokenizer.from_pretrained(model_name)
            elif 'XLMRoberta' in model_type:
                tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
            elif 'M-Bert' in model_type:
                tokenizer = BertTokenizer.from_pretrained(model_name)

        with open(data_file, 'rb') as f:
            data = pickle.load(f)

        # data =  data_file

        idx2labels, labels2idx = cls.create_labels(labels_path)
        config = {
            "min_char_len": min_char_len,
            "model_name": model_name,
            "max_sequence_length": max_seq_length,
            "clear_cache": clear_cache,
            "pad_idx": pad_idx,
            "is_cls": is_cls,
            "idx2labels": idx2labels,
            "labels2idx": labels2idx
        }

        return cls(data, image_dir, transform, tokenizer, config)
Example #4
 def __init__(self, auto_model: str, auto_path: str):
     super().__init__()
     if auto_model is None:
         auto_model = ""
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert2" in auto_model:
         from transformers import FlaubertModel, FlaubertTokenizer
         self.auto_embeddings = FlaubertModel.from_pretrained(auto_path)
         self.auto_tokenizer = FlaubertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "roberta" in auto_model:
         from transformers import RobertaModel, RobertaTokenizer
         self.auto_embeddings = RobertaModel.from_pretrained(auto_path)
         self.auto_tokenizer = RobertaTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
Example #5
import torch
from transformers import FlaubertModel, FlaubertTokenizer
from preprocess import preprocess
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

modelname = 'flaubert-base-uncased'

# Load pretrained model and tokenizer
flaubert, log = FlaubertModel.from_pretrained(modelname,
                                              output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname,
                                                       do_lowercase=True)


def get_flo_vec(q):
    query = preprocess(q, lower=True)
    token_ids = torch.tensor([flaubert_tokenizer.encode(query)])
    last_layer = flaubert(token_ids)[0][:, 1:-1, :]
    return last_layer.detach().numpy().mean(axis=1)


def build_flo_mat(titles_processed):
    f_mat = [get_flo_vec(t).squeeze() for t in tqdm(titles_processed)]
    return f_mat


class Predictor():
    def __init__(self, titles):
        self.mat = self._build_flo_mat(titles)
Example #6
from transformers import FlaubertWithLMHeadModel, FlaubertTokenizer
import torch
from torch.nn import functional as F
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
model = FlaubertWithLMHeadModel.from_pretrained(
    "flaubert/flaubert_base_uncased")

with open("mask682.txt", "r") as handle:
    lines = handle.readlines()

fichier = open("score de prediction682.txt", "w")
for line in lines:
    line = line.strip()

    coupe = line.split("**")

    mot = coupe[0]
    phrase = coupe[1]

    # The line presumably embeds an f-string placeholder such as {tokenizer.mask_token};
    # eval() renders it into the final masked sentence.
    sequence = eval(f"f'''{phrase}'''")

    token_ids = tokenizer.encode(sequence, return_tensors='pt')
    mask_token_index = torch.where(token_ids == tokenizer.mask_token_id)[1]

    token_logits = model(token_ids).logits
    softmax = F.softmax(token_logits, dim=-1)
Example #7
def main():
    usage = """<documentation>"""
    parser = argparse.ArgumentParser(
        description=usage, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", type=str, help="Path to input tsv file")
    parser.add_argument("output", type=str, help="Path to output")
    args = parser.parse_args()
    fic_input = args.input  # input file
    fic_output = args.output  # output file

    tableau = open(fic_input, 'r', encoding='utf8')
    next(tableau)  # skip the first line (header) of the table
    output = open(fic_output, 'w', encoding='utf8')
    output.write("Ligne" + "\t" + "Compositionnel" + "\t"
                 "Base" + "\t" + "Genre base" + "\t" + "Vecteur base" + "\t" +
                 "Collocatif" + "\t" + "Vecteur collocatif" + "\t" + "Phrase" +
                 "\n")  # column header of the output file

    # load FlauBERT
    model_id = "flaubert-base-cased"
    tokenizer = FlaubertTokenizer.from_pretrained(model_id,
                                                  do_lower_case=False)
    flaubert = FlaubertModel.from_pretrained(model_id)
    flaubert.eval()

    # iterate over the file
    # counter of which sentence of the file we are on; the first sentence is on line 2 of the input file
    cpt = 2
    for ligne in tableau:
        numero_ligne_phrase = cpt  # line number on which the sentence sits
        decoupage = ligne.split('\t')  # split the columns on tabs
        base = decoupage[0]
        genre_base = decoupage[1]
        nb_base = decoupage[2]
        collocatif = decoupage[3]
        genre_colloc = decoupage[4]
        nb_colloc = decoupage[5]
        lemme_colloc = decoupage[6]
        trait_compositionnel = decoupage[7]
        phrase = decoupage[8]

        # tokenization with FlauBERT
        # id of the base (the middle id, since encode() wraps it between "1" and "1")
        id_base = tokenizer.encode(base)[1]
        # id of the collocate (the middle id, since encode() wraps it between "1" and "1")
        id_collocatif = tokenizer.encode(collocatif)[1]
        # ids of the sentence tokens in the FlauBERT vocabulary
        id_phrase = tokenizer.encode(phrase)

        # dict with the token indices for EACH sentence:
        # key = token position in the sentence, value = id in the FlauBERT vocabulary
        tableau_indice = {}
        # dict with the occurrence counts for EACH sentence:
        # key = id in the FlauBERT vocabulary, value = number of occurrences
        nb_occurrences = {}

        # run PyTorch and FlauBERT on each sentence
        token_ids = torch.tensor([id_phrase])  # one tensor per sentence
        contextual_vectors = flaubert(token_ids)[0]  # compute the contextual vectors
        contextual_vectors = contextual_vectors.squeeze(0)  # drop the first dimension
        # recovered tokens (sometimes subword pieces, sometimes whole tokens)
        recovered_tokens = tokenizer.convert_ids_to_tokens(id_phrase)

        # go through the sentence token by token to count occurrences
        for i in range(0, len(id_phrase) - 1):
            id_token = id_phrase[i]
            tableau_indice[i] = id_token
            if id_token in nb_occurrences:
                nb_occurrences[id_token] += 1
            else:
                nb_occurrences[id_token] = 1

        # case where the base and the collocate each occur only once
        if nb_occurrences[id_base] == 1 and nb_occurrences[id_collocatif] == 1:
            resultat_colloc = id_collocatif
            resultat_base = id_base
            for tok in tableau_indice.keys():
                if tableau_indice[tok] == id_base:
                    place_tok_un = tok
                elif tableau_indice[tok] == id_collocatif:
                    place_tok_deux = tok

        # case where the base appears several times in the sentence
        elif nb_occurrences[id_base] > 1:
            # resultat_base will hold id_base and resultat_colloc will hold id_collocatif
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_base, id_collocatif, True)
        # case where the collocate appears several times in the sentence
        elif nb_occurrences[id_collocatif] > 1:
            # resultat_base will hold id_collocatif and resultat_colloc will hold id_base
            resultat_base, resultat_colloc, place_tok_un, place_tok_deux = base_collocatif_plus_proche(
                tableau_indice, id_collocatif, id_base, False)
        for i in range(0, len(recovered_tokens) - 1):
            if i == place_tok_un:  # the current token is the base/collocate of the sentence
                # ~ tok_un = recovered_tokens[i]  # token 1 with FlauBERT's segmentation
                vecteur_tok_un = contextual_vectors[i]  # vector of the current token
                tok_lu_un = base
            if i == place_tok_deux:  # the current token is the base/collocate of the sentence
                # ~ tok_deux = recovered_tokens[i]  # token 2 with FlauBERT's segmentation
                vecteur_tok_deux = contextual_vectors[i]  # vector of the current token
                tok_lu_deux = collocatif
        # write the line number, token1, token1 vector, token2, token2 vector and the full sentence
        output.write(
            str(numero_ligne_phrase) + "\t" + trait_compositionnel + "\t" +
            tok_lu_un + "\t" + genre_base + "\t" +
            " ".join(map(str, vecteur_tok_un.numpy())) + "\t" + tok_lu_deux +
            "\t" + " ".join(map(str, vecteur_tok_deux.numpy())) + "\t" +
            phrase + "\n")
        cpt += 1

    output.close()
Example #8
import bson
import numpy as np
import pymongo
from tqdm import tqdm
from transformers import FlaubertTokenizer

from utils import post_token_classification
from flaubert_token_classification import TFFlaubertForTokenClassification

model = TFFlaubertForTokenClassification.from_pretrained("../models/ner")
tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")
SEQUENCE_LENGTH = 64
ner_labels = ["LOC", "MISC", "ORG", "PER", "O"]

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["bert_clustering"]

articles = db["articles"].find()

for article in tqdm(articles):
    sentence = tokenizer.tokenize(article["raw"])
    input_tokens = tokenizer.encode_plus(
        article["raw"],
        max_length=SEQUENCE_LENGTH,
        pad_to_max_length=True,  # boolean flag; sequences are padded to max_length
        add_special_tokens=True,
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
    )
Example #9
import pandas as pd
from Embedder import getContextualEmbedding, concatEmbeddingEn
from transformers import BertModel
from transformers import FlaubertModel
from transformers import BertTokenizer
from transformers import FlaubertTokenizer
import os

LANG = "EN"
if LANG == "FR":
    tokenizer = FlaubertTokenizer.from_pretrained(
        'flaubert/flaubert_base_cased', do_lowercase=False)
    model, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased',
                                               output_loading_info=True)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    model = BertModel.from_pretrained('bert-base-cased')

my_file = open("wiki.dump", "r")
content = my_file.read()
my_file.close()
dfWiki = pd.DataFrame()
number = len(content.split())
i = 0

print("Start !", flush=True)
p = content.split('\n')
print("{} articles to process".format(len(p)), flush=True)
for sentence in p:
    for sent in range(len(sentence.split()) - 500):
Example #10
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import os
import random
import wandb
from transformers import FlaubertModel, FlaubertTokenizer
from pytorchtools import EarlyStopping
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"



model_id = "flaubert/flaubert_base_uncased"
tokenizer = FlaubertTokenizer.from_pretrained(model_id, do_lower_case=False)
flaubert = FlaubertModel.from_pretrained(model_id, output_hidden_states=True)


wandb.init(project="FNN")
wandb.watch_called = False
config = wandb.config

# parameters
# dim_input = 3072  # concatenation of 4 layers
dim_input = 768
dim_hidden = 100
config.epochs = 100
patience = 20
config.seed = 42
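
One plausible shape for the feed-forward network these hyper-parameters describe; the FNN class and its number of output classes are assumptions, since the actual model definition is not shown here.

class FNN(nn.Module):
    # Hypothetical classifier head over FlauBERT sentence vectors
    # (dim_input -> dim_hidden -> n_classes).
    def __init__(self, dim_input, dim_hidden, n_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(dim_input, dim_hidden)
        self.fc2 = nn.Linear(dim_hidden, n_classes)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


# e.g. net = FNN(dim_input, dim_hidden)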
Example #11
for i in range(len(description)):
    document = " ".join(description[i])
    inputs.append(document)

# segmentation of sentences for each document
nlp = spacy.load("fr_core_news_sm")
doc_sent = []
for i in range(len(inputs)):
    doc = nlp(inputs[i])
    sentences = [sent.text for sent in doc.sents]
    doc_sent.append(sentences)

# define the sentence tokenizer (padding and truncation are set per call in get_tokenized below)
MAX_LEN = 512
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')


def get_tokenized(documents, label):
    tokenizer_out = tokenizer(documents,
                              add_special_tokens=True,
                              max_length=MAX_LEN,
                              return_token_type_ids=False,
                              padding='max_length',
                              return_attention_mask=True,
                              return_tensors='pt',
                              truncation=True)
    label = torch.tensor(label, dtype=torch.long)
    # tokenizer_out is a dictionary with 2 keys: input_ids and attention_mask
    return tokenizer_out, label  # return a 2-element tuple
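
A hedged usage sketch for get_tokenized, assuming the sentences of one document are tokenized as a batch and 0 is a placeholder class label.

# Tokenize the sentences of the first document; 0 is a placeholder label.
features, target = get_tokenized(doc_sent[0], 0)
print(features['input_ids'].shape, features['attention_mask'].shape, target)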
Example #12
def parse_dawt(label_map, max_seq_length=64, pad_token_label_id=0):
    sample = []
    labels = []
    label_counts = {key: 0 for key, _ in label_map.items()}

    annotations = open(
        "../../opendata/wiki_annotation/wiki_annotations_json_sample_fr")
    tokenizer = FlaubertTokenizer.from_pretrained(
        "jplu/tf-flaubert-base-cased")

    for annotation in tqdm(annotations):
        a = json.loads(annotation)

        if "entities" not in a or "tokens" not in a:
            continue

        entities = a["entities"]
        tokens = a["tokens"]
        del a

        # Add entities to tokens
        for entity in entities:
            i = entity["start_position"]
            token = tokens[i]

            if "type" not in entity:
                continue

            entity.pop("id_str", None)
            if "entity" not in token and (entity["end_position"] -
                                          entity["start_position"] == 0):
                token["entity"] = entity
            i += 1

        word_tokens = []
        label_ids = []

        for idx, token in enumerate(tokens):
            word = token["raw_form"]
            label = token["entity"]["type"] if "entity" in token else 'O'

            if idx != len(tokens) and word.lower() in [
                    "l", "d", "s", "t", "n", "j", "m", "n"
            ]:
                word += "\'"
                tokens[idx]["raw_form"] = word

            word_token = tokenizer.tokenize(word)
            word_tokens.extend(word_token)
            label_ids.extend([label_map[label]] + [0] * (len(word_token) - 1))
            label_counts[label] += 1 * len(word_token)

            if "section_break" in token:
                word_tokens = [tokenizer.cls_token
                               ] + word_tokens + [tokenizer.sep_token]

                padding_length = max_seq_length - len(word_tokens)

                label_ids = [pad_token_label_id
                             ] + label_ids + [pad_token_label_id]
                input_ids = tokenizer.convert_tokens_to_ids(
                    word_tokens + [tokenizer.pad_token] * padding_length)
                label_ids += [pad_token_label_id] * padding_length
                sample.append(input_ids[:max_seq_length])
                labels.append(label_ids[:max_seq_length])
                word_tokens = []
                label_ids = []

    return sample, labels, label_counts


# if __name__ == "__main__":
#     s, l = parse_dawt()
#     print(s[0])
#     print(l[0])
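
A hedged usage sketch for parse_dawt; the label_map below is an assumption borrowed from the NER label set in Example #8, not necessarily this project's real mapping.

if __name__ == "__main__":
    # Assumed label map (order is arbitrary); the real one is defined elsewhere.
    label_map = {"O": 0, "PER": 1, "LOC": 2, "ORG": 3, "MISC": 4}
    samples, labels, label_counts = parse_dawt(label_map)
    print(len(samples), label_counts)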
Example #13
            else:
                model_name = "bert-base-uncased"
            tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
        elif language == "el":
            if args.cased:
                print("model not available")
            else:
                model_name = "nlpaueb/bert-base-greek-uncased-v1"
            tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)

        elif language == "fr":
            if args.cased:
                print("model not available")
            else:
                model_name = "flaubert/flaubert_base_uncased"
            tokenizer = FlaubertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)

        elif language == "es":
            if args.cased:
                print("model not available")
            else:
                model_name = "dccuchile/bert-base-spanish-wwm-uncased"
            tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)

    
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

    torch.manual_seed(0)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    batch_size = 1
    toplayer = 13