def test_fasttext_embeddings(self):
    # First we will add smaller test embeddings to the available models
    MODELS['ddt.swv'] = {
        'url': 'https://danlp.alexandra.dk/304bd159d5de/tests/ddt.swv.zip',
        'vocab_size': 5000,
        'dimensions': 100,
        'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
        'size': 741125088,
        'file_extension': '.bin'
    }
    AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

    download_model('ddt.swv', process_func=_unzip_process_func)

    fasttext_embeddings = load_wv_with_gensim('ddt.swv')
    self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

    # The word is not in the vocab
    self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)

    # However we can get an embedding because of subword units
    self.assertEqual(fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False):
    """
    Loads a spaCy model.

    OBS: vectorError is a temporary, ugly workaround for an error encountered
    when keeping two models and not being able to find the reference name for vectors
    """
    from spacy.util import load_model_from_path

    if textcat is None or vectorError is True:
        modelname = 'spacy'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)

        nlp = load_model_from_path(model_weight_path)

    if textcat == 'sentiment':
        modelname = 'spacy.sentiment'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)
        # quick fix for misaligned model storage:
        import os
        model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment')

        nlp = load_model_from_path(model_weight_path)

    return nlp
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')

    # load the class names mapping
    self.catagories = {0: 'Glæde/Sindsro', 1: 'Tillid/Accept',
                       2: 'Forventning/Interrese', 3: 'Overasket/Målløs',
                       4: 'Vrede/Irritation', 5: 'Foragt/Modvilje',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
    self.labels_no = {1: 'No emotion', 0: 'Emotional'}

    # load the models
    self.tokenizer_reject = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(
        path_reject, num_labels=len(self.labels_no.keys()))

    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(
        path_emotion, num_labels=len(self.catagories.keys()))

    # save the embedding dimension, to later ensure the sequence is no longer than the embeddings
    self.max_length = self.model.bert.embeddings.position_embeddings.num_embeddings
    self.max_length_reject = self.model_reject.bert.embeddings.position_embeddings.num_embeddings
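# A minimal sketch of how the two-stage emotion setup above could be used for
# prediction: the no-emotion model first rejects non-emotional text, and only
# then does the emotion model pick one of the eight categories. This is an
# illustration, not the class's actual API; it assumes a transformers version
# where tokenizers are callable and model outputs expose .logits. The bare
# names below stand for the corresponding self.* attributes set in the
# constructor above.
import torch

def predict_emotion(sentence: str) -> str:
    # rejection step: is the text emotional at all?
    inputs = tokenizer_reject(sentence, return_tensors='pt',
                              truncation=True, max_length=max_length_reject)
    with torch.no_grad():
        reject_pred = model_reject(**inputs).logits.argmax().item()
    if labels_no[reject_pred] == 'No emotion':
        return 'No emotion'

    # emotion step: pick the most probable of the eight categories
    inputs = tokenizer(sentence, return_tensors='pt',
                       truncation=True, max_length=max_length)
    with torch.no_grad():
        emotion_pred = model(**inputs).logits.argmax().item()
    return catagories[emotion_pred]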
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')

    # load the models
    self.tokenizer_rejct = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)

    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)

    # load the class names mapping
    self.catagories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept', 4: 'Vrede/Irritation',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_off = download_model('bert.hatespeech.detection', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_off = os.path.join(path_off, 'bert.hatespeech.detection')
    path_hate = download_model('bert.hatespeech.classification', cache_dir,
                               process_func=_unzip_process_func,
                               verbose=verbose)
    path_hate = os.path.join(path_hate, 'bert.hatespeech.classification')

    self.classes_off = ['NOT', 'OFF']
    self.classes_hate = ['Særlig opmærksomhed', 'Personangreb',
                         'Sprogbrug', 'Spam & indhold']

    self.tokenizer_off = BertTokenizer.from_pretrained(path_off)
    self.model_off = BertForSequenceClassification.from_pretrained(
        path_off, num_labels=len(self.classes_off))

    self.tokenizer_hate = BertTokenizer.from_pretrained(path_hate)
    self.model_hate = BertForSequenceClassification.from_pretrained(
        path_hate, num_labels=len(self.classes_hate))

    # save the embedding dimension, to later ensure the sequence is no longer than the embeddings
    self.max_length_hate = self.model_hate.bert.embeddings.position_embeddings.num_embeddings
    self.max_length_off = self.model_off.bert.embeddings.position_embeddings.num_embeddings
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_sub = download_model('bert.subjective', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_sub = os.path.join(path_sub, 'bert.sub.v0.0.1')
    path_pol = download_model('bert.polarity', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_pol = os.path.join(path_pol, 'bert.pol.v0.0.1')

    self.classes_pol = ['positive', 'neutral', 'negative']
    self.classes_sub = ['objective', 'subjective']

    self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
    self.model_sub = BertForSequenceClassification.from_pretrained(
        path_sub, num_labels=len(self.classes_sub))

    self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
    self.model_pol = BertForSequenceClassification.from_pretrained(
        path_pol, num_labels=len(self.classes_pol))

    # save the embedding dimension, to later ensure the sequence is no longer than the embeddings
    self.max_length_sub = self.model_sub.bert.embeddings.position_embeddings.num_embeddings
    self.max_length_pol = self.model_pol.bert.embeddings.position_embeddings.num_embeddings
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, verbose: bool = False): """ Available wordembeddings: - wiki.da.wv - cc.da.wv - conll17.da.wv - news.da.wv - sketchengine.da.wv Available subwordembeddings: - wiki.da.swv - cc.da.swv - sketchengine.da.swv :param pretrained_embedding: :param cache_dir: the directory for storing cached data :param verbose: :return: KeyedVectors or FastTextKeyedVectors """ word_embeddings_available(pretrained_embedding, can_use_subword=True) download_model(pretrained_embedding, cache_dir, _process_downloaded_embeddings, verbose=verbose) wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin") if pretrained_embedding.split(".")[-1] == 'wv': return KeyedVectors.load_word2vec_format(wv_path, binary=True) elif pretrained_embedding.split(".")[-1] == 'swv': from gensim.models.fasttext import load_facebook_vectors return load_facebook_vectors(wv_path)
def load_context_embeddings_with_flair(direction='bi', word_embeddings=True,
                                       cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    :param direction: bidirectional 'bi', forward 'fwd' or backward 'bwd'
    :param word_embeddings:
    :param cache_dir:
    :param verbose:
    """
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import WordEmbeddings
    from flair.embeddings import StackedEmbeddings

    embeddings = []

    if word_embeddings:
        fasttext_embedding = WordEmbeddings('da')
        embeddings.append(fasttext_embedding)

    if direction == 'bi' or direction == 'fwd':
        fwd_weight_path = download_model('flair.fwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(fwd_weight_path))

    if direction == 'bi' or direction == 'bwd':
        bwd_weight_path = download_model('flair.bwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(bwd_weight_path))

    if len(embeddings) == 1:
        return embeddings[0]

    return StackedEmbeddings(embeddings=embeddings)
def load_wv_with_spacy(pretrained_embedding: str, cache_dir: str = DEFAULT_CACHE_DIR,
                       verbose=False):
    """
    :param str pretrained_embedding:
    :param str cache_dir: the directory for storing cached data
    :param bool verbose:
    :return: a spaCy model
    """
    import spacy

    # spaCy does not support subwords
    word_embeddings_available(pretrained_embedding, can_use_subword=False)

    spacy_model_dir = os.path.join(cache_dir, pretrained_embedding + ".spacy")

    if os.path.isdir(spacy_model_dir):
        # Return the spaCy model if the spaCy model dir exists
        return spacy.load(spacy_model_dir)

    bin_file_path = os.path.join(cache_dir, pretrained_embedding + ".bin")

    if os.path.isfile(bin_file_path):
        # Then we do not need to download the model
        _process_embeddings_for_spacy(bin_file_path[:-4] + ".tmp")
    else:
        download_model(pretrained_embedding, cache_dir,
                       _process_embeddings_for_spacy, verbose=True,
                       file_extension='.spacy')

    return spacy.load(spacy_model_dir)
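# A short usage sketch for the spaCy embedding loader above. 'wiki.da.wv' is one
# of the embedding names listed earlier in this section; the Danish sentence is
# illustrative.
nlp = load_wv_with_spacy('wiki.da.wv')

doc = nlp('Jeg har en hund')
for token in doc:
    # each token carries the pretrained word vector
    print(token.text, token.has_vector, token.vector.shape)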
def setUp(self):
    # First we will add smaller test embeddings to the available models
    MODELS['wiki.da.small.wv'] = {
        'url': 'https://danlp.s3.eu-central-1.amazonaws.com/test-models/wiki.da.small.zip',
        'vocab_size': 5000,
        'dimensions': 300,
        'md5_checksum': 'fcaa981a613b325ae4dc61aba235aa82',
        'size': 5594508,
        'file_extension': '.bin'
    }
    AVAILABLE_EMBEDDINGS.append('wiki.da.small.wv')

    # Let's download the model and unzip it
    download_model('wiki.da.small.wv', process_func=_unzip_process_func)
def load_context_embeddings_with_flair(direction='bi', word_embeddings=None,
                                       cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads contextual (dynamic) word embeddings with flair.

    :param str direction: bidirectional 'bi', forward 'fwd' or backward 'bwd'
    :param word_embeddings:
    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    """
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import WordEmbeddings
    from flair.embeddings import StackedEmbeddings

    embeddings = []

    if word_embeddings is not None:
        _word_embeddings_available(word_embeddings, can_use_subword=False)
        download_model(word_embeddings, cache_dir,
                       _process_downloaded_embeddings, verbose=verbose)
        wv_path = os.path.join(cache_dir, word_embeddings + ".bin")

        fasttext_embedding = WordEmbeddings(wv_path)
        embeddings.append(fasttext_embedding)

    if direction == 'bi' or direction == 'fwd':
        fwd_weight_path = download_model('flair.fwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(fwd_weight_path))

    if direction == 'bi' or direction == 'bwd':
        bwd_weight_path = download_model('flair.bwd', cache_dir,
                                         verbose=verbose,
                                         process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(bwd_weight_path))

    if len(embeddings) == 1:
        return embeddings[0]

    return StackedEmbeddings(embeddings=embeddings)
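# A minimal usage sketch for the flair loader above: embed a Danish sentence
# with the stacked forward/backward embeddings. The example sentence is
# illustrative.
from flair.data import Sentence

stacked_embeddings = load_context_embeddings_with_flair(direction='bi')

sentence = Sentence('Jeg hopper på en bil')
stacked_embeddings.embed(sentence)

for token in sentence:
    # each token now has a contextual embedding attached
    print(token.text, token.embedding.shape)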
def setUp(self):
    # First we will add smaller test embeddings to the available models
    MODELS['wiki.da.small.wv'] = {
        'url': 'https://danlp.alexandra.dk/304bd159d5de/tests/wiki.da.small.zip',
        'vocab_size': 5000,
        'dimensions': 300,
        'md5_checksum': 'fcaa981a613b325ae4dc61aba235aa82',
        'size': 5594508,
        'file_extension': '.bin'
    }
    AVAILABLE_EMBEDDINGS.append('wiki.da.small.wv')

    self.embeddings_for_testing = ['wiki.da.small.wv', 'dslreddit.da.wv']

    # Let's download the models and unzip them
    for emb in self.embeddings_for_testing:
        download_model(emb, process_func=_unzip_process_func)
def test_download(self):
    model_name = 'xlmr.coref'
    # Download model beforehand
    model_path = download_model(model_name, DEFAULT_CACHE_DIR,
                                process_func=_unzip_process_func,
                                verbose=True)
    # check if path to model exists
    self.assertTrue(os.path.exists(model_path))
def test_download(self):
    # Download model beforehand
    model_path = download_model('spacy', DEFAULT_CACHE_DIR,
                                process_func=_unzip_process_func,
                                verbose=True)

    info = spacy.info(model_path)
    self.assertListEqual(info['pipeline'], ['tagger', 'parser', 'ner'])
    self.assertEqual(info['lang'], 'da')
def test_flair_tagger(self):
    # Download model beforehand
    download_model('flair.ner', DEFAULT_CACHE_DIR,
                   process_func=_unzip_process_func, verbose=True)
    print("Downloaded the flair model")

    # Load the NER tagger using the DaNLP wrapper
    flair_model = load_flair_ner_model()

    # Use the flair NER tagger
    sentence = Sentence('jeg hopper på en bil som er rød sammen med Jens-Peter E. Hansen')
    flair_model.predict(sentence)

    expected_string = "jeg hopper på en bil som er rød sammen med " \
                      "Jens-Peter <B-PER> E. <I-PER> Hansen <I-PER>"
    self.assertEqual(sentence.to_tagged_string(), expected_string)
def test_download(self):
    # Download model beforehand
    model = 'bert.botxo.pytorch'
    model_path = download_model(model, DEFAULT_CACHE_DIR,
                                process_func=_unzip_process_func,
                                verbose=True)
    # check if path to model exists
    self.assertTrue(os.path.exists(model_path))
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertForNextSentencePrediction, BertTokenizer

    # download model
    self.path_model = download_model('bert.botxo.pytorch', cache_dir,
                                     process_func=_unzip_process_func,
                                     verbose=verbose)

    # Load pre-trained model tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(self.path_model)

    # Load pre-trained model (weights)
    self.model = BertForNextSentencePrediction.from_pretrained(
        self.path_model,
        output_hidden_states=True,  # Whether the model returns all hidden-states.
    )
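# A minimal sketch of how the next-sentence-prediction model loaded above could
# be queried, assuming a transformers version where tokenizers are callable and
# model outputs expose .logits. `tokenizer` and `model` stand for the
# self.tokenizer / self.model attributes set in the constructor; the Danish
# sentence pair is illustrative.
import torch

inputs = tokenizer('Jeg har en hund.', 'Den er rød.', return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits

# class 0 means "sentence B follows sentence A" in BertForNextSentencePrediction
prob_is_next = torch.softmax(logits, dim=1)[0, 0].item()
print(prob_is_next)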
def test_flair_tagger(self):
    # Download model beforehand
    download_model('flair.pos', DEFAULT_CACHE_DIR,
                   process_func=_unzip_process_func, verbose=True)
    print("Downloaded the flair model")

    # Load the POS tagger using the DaNLP wrapper
    flair_model = load_pos_tagger_with_flair()

    # Use the flair POS tagger
    sentence = Sentence('jeg hopper på en bil som er rød sammen med Jens-Peter E. Hansen')
    flair_model.predict(sentence)

    expected_string = "jeg <PRON> hopper <VERB> på <ADP> en <DET> bil <NOUN> som <ADP> er " \
                      "<AUX> rød <ADJ> sammen <ADV> med <ADP> Jens-Peter <PROPN> " \
                      "E. <PROPN> Hansen <PROPN>"
    self.assertEqual(sentence.to_tagged_string(), expected_string)
def test_download(self):
    # Download the models beforehand
    for model in ['bert.emotion', 'bert.noemotion']:
        model_path = download_model(model, DEFAULT_CACHE_DIR,
                                    process_func=_unzip_process_func,
                                    verbose=True)
        model_path = os.path.join(model_path, model)
        # check if path to model exists
        self.assertTrue(os.path.exists(model_path))
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    model_path = download_model('bert.offensive', cache_dir,
                                process_func=_unzip_process_func,
                                verbose=verbose)

    self.classes = ['NOT', 'OFF']

    self.model = BertForSequenceClassification.from_pretrained(
        model_path, num_labels=len(self.classes))
    self.tokenizer = BertTokenizer.from_pretrained(model_path)

    self.max_length = self.model.bert.embeddings.position_embeddings.num_embeddings
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification, AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                       "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
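# A minimal sketch of tagging a sentence with the token-classification model
# above, assuming a transformers version where tokenizers are callable and
# model outputs expose .logits. `tokenizer`, `model` and `label_list` stand for
# the attributes set in the constructor; the Danish sentence is illustrative.
import torch

sentence = 'Jens-Peter bor i København'
inputs = tokenizer(sentence, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits

predictions = logits.argmax(dim=2)[0]
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for token, pred in zip(tokens, predictions):
    # note: these are wordpiece tokens, including [CLS] and [SEP]
    print(token, label_list[pred])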
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

    # download the model or load the model path
    model_path = download_model('xlmr.ned', cache_dir,
                                process_func=_unzip_process_func,
                                verbose=verbose)

    self.classes = ['0', '1']

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
    self.model = XLMRobertaForSequenceClassification.from_pretrained(
        model_path, num_labels=len(self.classes))

    self.max_length = self.model.roberta.embeddings.position_embeddings.num_embeddings - 2
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False): """ Loads a spacy model. """ from spacy.util import load_model_from_path model_weight_path = download_model('spacy', cache_dir, process_func=_unzip_process_func, verbose=verbose) nlp = load_model_from_path(model_weight_path) return nlp
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertModel
    import torch

    # download model
    self.path_model = download_model('bert.botxo.pytorch', cache_dir,
                                     process_func=_unzip_process_func,
                                     verbose=verbose)

    # Load pre-trained model tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(self.path_model)

    # Load pre-trained model (weights)
    self.model = BertModel.from_pretrained(
        self.path_model,
        output_hidden_states=True,  # Whether the model returns all hidden-states.
    )

    # Put the model in "evaluation" mode (disables dropout), i.e. pure feed-forward operation.
    self.model.eval()
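# A minimal sketch of pulling token embeddings out of the BertModel above. With
# output_hidden_states=True the model returns the activations of every layer;
# here we take the last hidden layer. `tokenizer` and `model` stand for the
# attributes set in the constructor, and the .hidden_states access assumes a
# transformers version that returns model-output objects.
import torch

inputs = tokenizer('Jeg har en hund', return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# tuple of (embedding layer + one tensor per transformer layer),
# each of shape (batch, sequence length, hidden size)
hidden_states = outputs.hidden_states
token_embeddings = hidden_states[-1][0]  # last layer, first (only) sentence
print(token_embeddings.shape)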
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False): """ Loads a spaCy model. :param str cache_dir: the directory for storing cached models :param bool verbose: `True` to increase verbosity :param bool textcat: '`sentiment`' for loading the spaCy sentiment analyser :param bool vectorError: :return: a spaCy model .. warning:: vectorError is a temporary work around error encounted by keeping two models and not been able to find reference name for vectors """ from spacy.util import load_model_from_path if textcat==None or vectorError==True: modelname='spacy' model_weight_path = download_model(modelname, cache_dir, process_func=_unzip_process_func, verbose=verbose) nlp = load_model_from_path(model_weight_path) if textcat=='sentiment': modelname='spacy.sentiment' model_weight_path = download_model(modelname, cache_dir, process_func=_unzip_process_func, verbose=verbose) # quick fix from not aligned models storage: import os model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment') nlp = load_model_from_path(model_weight_path) return nlp
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    # download the model or load the model path
    model_path = download_model('xlmr.coref', cache_dir,
                                process_func=_unzip_process_func,
                                verbose=verbose)

    archive = load_archive(model_path)
    self.config = archive.config
    prepare_environment(self.config)
    self.model = archive.model
    self.dataset_reader = archive.validation_dataset_reader
    self.predictor = CorefPredictor(model=self.model,
                                    dataset_reader=self.dataset_reader)
def load_ner_tagger_with_flair(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    :param cache_dir:
    :param verbose:
    :return:
    """
    from flair.models import SequenceTagger

    model_weight_path = download_model('flair.ner', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)

    # load the flair model
    flair_model = SequenceTagger.load(model_weight_path)

    return flair_model
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR, verbose: bool = False): """ Loads word embeddings with Gensim. :param str pretrained_embedding: :param cache_dir: the directory for storing cached data :param bool verbose: `True` to increase verbosity :return: KeyedVectors or FastTextKeyedVectors """ _word_embeddings_available(pretrained_embedding, can_use_subword=True) download_model(pretrained_embedding, cache_dir, _process_downloaded_embeddings, verbose=verbose) wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin") if pretrained_embedding.split(".")[-1] == 'wv': return KeyedVectors.load_word2vec_format(wv_path, binary=True) elif pretrained_embedding.split(".")[-1] == 'swv': from gensim.models.fasttext import load_facebook_vectors return load_facebook_vectors(wv_path)
def test_download(self):
    version = {'bert.subjective': 'bert.sub.v0.0.1',
               'bert.polarity': 'bert.pol.v0.0.1'}

    # Download the models beforehand
    for model in ['bert.subjective', 'bert.polarity']:
        model_path = download_model(model, DEFAULT_CACHE_DIR,
                                    process_func=_unzip_process_func,
                                    verbose=True)
        model_path = os.path.join(model_path, version[model])
        # check if path to model exists
        self.assertTrue(os.path.exists(model_path))
def load_flair_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads a flair model for NER.

    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    :return: an NER flair model
    """
    from flair.models import SequenceTagger

    model_weight_path = download_model('flair.ner', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)

    # load the flair model
    flair_model = SequenceTagger.load(model_weight_path)

    return flair_model