Example #1
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from flair.embeddings import FlairEmbeddings
language_model = FlairEmbeddings('pubmed-forward').lm
dictionary: Dictionary = Dictionary.load('chars')
# alternatively, reuse the dictionary of the loaded model:
# dictionary: Dictionary = language_model.dictionary
# get your corpus, process forward and at the character level
is_forward_lm = True

corpus = TextCorpus('/content/corpus',
                    dictionary,
                    is_forward_lm,
                    character_level=True)

trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('/content/language_model',
              sequence_length=10,
              mini_batch_size=10,
              max_epochs=10)
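A minimal follow-up sketch, assuming the run above completes and saves its best model as best-lm.pt in the training folder passed to trainer.train(): the fine-tuned language model can then be loaded back as character-level embeddings.

# illustrative only: reload the fine-tuned LM from the training folder used above
fine_tuned_embeddings = FlairEmbeddings('/content/language_model/best-lm.pt')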
Example #2
search_space.add_optimization_value(param.OptimizationValue.DEV_SCORE)
search_space.add_max_epochs_per_training_run(25)

search_space.add_parameter(param.SequenceTagger.HIDDEN_SIZE,
                           options=[128, 256, 512])
search_space.add_parameter(param.SequenceTagger.DROPOUT,
                           options=[0, 0.1, 0.2, 0.3])
search_space.add_parameter(param.SequenceTagger.WORD_DROPOUT,
                           options=[0, 0.01, 0.05, 0.1])
search_space.add_parameter(param.SequenceTagger.RNN_LAYERS,
                           options=[2, 3, 4, 5, 6])
search_space.add_parameter(param.SequenceTagger.USE_RNN, options=[True, False])
search_space.add_parameter(param.SequenceTagger.USE_CRF, options=[True, False])
search_space.add_parameter(param.SequenceTagger.REPROJECT_EMBEDDINGS,
                           options=[True, False])
search_space.add_word_embeddings(options=[[
    TransformerWordEmbeddings(model="distilbert-base-uncased", batch_size=16),
    FlairEmbeddings("news-backward"),
    WordEmbeddings("glove")
]])

search_strategy.make_configurations(search_space)

orchestrator = orchestrator.Orchestrator(
    corpus=corpus,
    base_path="resources/evaluation_wnut_grid",
    search_space=search_space,
    search_strategy=search_strategy)

orchestrator.optimize()
Example #3
    def post_init(self):
        from flair.embeddings import FlairEmbeddings
        self._flair = FlairEmbeddings(self.model_name)
Example #4
                        help="prompt / sequence length")
    parser.add_argument("--n_epochs", type=int, default=100)
    parser.add_argument("--n_chars", type=int, default=3000,
                        help="number of generated characters")
    parser.add_argument("--ckpt_dir", type=str, default="checkpoints2")
    parser.add_argument("--output_dir", type=str, default="outputs2")
    hp = parser.parse_args()

    if not os.path.exists(hp.ckpt_dir): os.makedirs(hp.ckpt_dir)
    if not os.path.exists(hp.output_dir): os.makedirs(hp.output_dir)

    # device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print("# load existing language model")
    news_forward = FlairEmbeddings('news-forward')
    model = news_forward.lm  # the trained LanguageModel wrapped by the embeddings
    model.to(device)

    print("# load input data")
    item2idx = model.dictionary.item2idx
    print(item2idx["\n".encode()])

    inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
    inputs = [item2idx.get(char.encode(), 0) for char in inputs]
    inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
    inputs = inputs.to(device)

    print("# load corpus")
    corpus = TextCorpus(Path('corpus/'),
                        model.dictionary,
Example #5
# assumed imports for this snippet (legacy flair embeddings API)
import pandas as pd

from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings, ELMoEmbeddings, FlairEmbeddings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from google.colab import files
from datetime import datetime

bert_embedding = BertEmbeddings()
bert_train_document_embeddings = DocumentPoolEmbeddings([bert_embedding])
bert_test_document_embeddings = DocumentPoolEmbeddings([bert_embedding])

#requires allennlp
elmo_embedding = ELMoEmbeddings()
elmo_train_document_embeddings = DocumentPoolEmbeddings([elmo_embedding])
elmo_test_document_embeddings = DocumentPoolEmbeddings([elmo_embedding])

flair_embedding_forward = FlairEmbeddings('news-forward')
flair_train_document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])
flair_test_document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])

uploaded = files.upload()

data = pd.read_csv("finalDataset.csv")

# BERT

for count in range(30):
  start_time = datetime.now()
  train_sentences = []
  train_labels = []
  test_sentences = []
  test_labels = []
Example #6
def get_mixed_bio_embeddings():
    return [
        WordEmbeddings("en-crawl"),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
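A brief usage sketch, assuming the usual flair.embeddings imports (WordEmbeddings, FlairEmbeddings) are in place: the list returned above is typically wrapped in StackedEmbeddings and applied to a Sentence token by token.

from flair.data import Sentence
from flair.embeddings import StackedEmbeddings

stacked = StackedEmbeddings(embeddings=get_mixed_bio_embeddings())
sentence = Sentence("Aspirin inhibits platelet aggregation .")
stacked.embed(sentence)
print(sentence[0].embedding.shape)  # concatenated per-token vector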
Example #7
def get_bio_ft_embeddings():
    return [
        FastTextEmbeddings("data/BioWordVec_PubMed_MIMICIII_d200.bin"),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
Example #8
    },
    tag_to_bioes="ner",
    skip_first_line=True)
print(corpus)

# 2. what tag do we want to predict?
tag_type = "ner"

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de-wiki"),
    FlairEmbeddings("de-impresso-hipe-v1-forward", lowercased_lm=True),
    FlairEmbeddings("de-impresso-hipe-v1-backward", lowercased_lm=True),
    #TransformerWordEmbeddings("/mnt/clef-hipe-parser-master/transformers/examples/token-classification/german-large-2", layers="all", use_scalar_mix=True)
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
Example #9
    def __init__(self, *pres):
        super().__init__(*pres)
        self.flair_embedding_backward = FlairEmbeddings('news-backward')
Example #10
                        help="output folder")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    input_folder = args.data_folder
    embedding = args.embedding
    epochs = args.epochs
    output_folder = args.output_folder
    if embedding == 'no_char':
        embedding_types = [
            WordEmbeddings("glove"),
        ]
    elif embedding == 'char':
        embedding_types = [
            WordEmbeddings("glove"),
            CharacterEmbeddings(),
        ]
    elif embedding == 'flair':
        embedding_types = [
            WordEmbeddings("glove"),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]
    else:
        raise ValueError(
            'Please choose one of the following options for the embedding: no_char, char, flair'
        )
    embeddings, tag_dictionary, corpus = load_input_dataset(
        input_folder, embedding_types)
    train_model(output_folder, embeddings, epochs, tag_dictionary, corpus)
Example #11
def get_embedder():
  glove_embedding = WordEmbeddings('glove')
  flair_embedding_forward = FlairEmbeddings('news-forward')
  flair_embedding_backward = FlairEmbeddings('news-backward')
  
  return DocumentPoolEmbeddings([glove_embedding, flair_embedding_backward, flair_embedding_forward], pooling='mean')
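A short usage sketch of the pooled embedder above, assuming the flair.embeddings imports used in get_embedder are present; the sentence text is illustrative.

from flair.data import Sentence

embedder = get_embedder()
sentence = Sentence("Flair makes document embeddings easy .")
embedder.embed(sentence)
print(sentence.embedding.shape)  # mean-pooled document vector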
Example #12
# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('fr-crawl'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    FlairEmbeddings('fr-forward'),
    FlairEmbeddings('fr-backward'),

    # bert embeddings
    # BertEmbeddings('bert-base-french')

    # CCASS Flair Embeddings FWD
    #FlairEmbeddings('/data/embeddings_CCASS/flair_language_model/jurinet/best-lm.pt'),

    # CCASS Flair Embeddings BWD
    #FlairEmbeddings('/data/embeddings_CCASS/flair_language_model/jurinet/best-lm-backward.pt')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
Example #13
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
# modeltrainer.train()
# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    WordEmbeddings('glove.gensim'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    FlairEmbeddings('news-forward-0.4.1.pt'),
    FlairEmbeddings('news-backward-0.4.1.pt'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

Example #14
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        base_path=file_path,
        #EvaluationMetric.MACRO_F1_SCORE,
        max_epochs=n_epochs,
        checkpoint=True)

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
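A hedged usage sketch for the trainer function above; the directory and split file names are hypothetical and must point at an existing FastText-format classification corpus.

from pathlib import Path

trainer(file_path=Path('data/sentiment'),
        filenames=('train.txt', 'dev.txt', 'test.txt'),
        checkpoint=None,
        stack='glove',
        n_epochs=5)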
Example #15
def test_load_non_existing_flair_embedding():
    with pytest.raises(ValueError):
        FlairEmbeddings("other")
Example #16
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """
    Trains the sequence labeling model (by default the model uses one RNN layer).
    The model is trained to predict the part-of-speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of a separator before the token,
    - proposed tags for the given token.
    It is trained using Stacked Embeddings, which combine different embeddings; words are embedded
    using a concatenation of two vector embeddings:
    - Flair Embeddings - contextual string embeddings that capture latent syntactic-semantic
      information that goes beyond standard word embeddings. Key differences are: (1) they are trained without any
      explicit notion of words and thus fundamentally model words as sequences of characters. And (2) they are
      contextualized by their surrounding text, meaning that the same word will have different embeddings depending on
      its contextual use.
      Only the forward model (which reads the input plain text from left to right) is used for part-of-speech
      (pos) tag training; the backward model (which reads the input from right to left) is not used here.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge, as most other embeddings do. They also
      differ in that they need to see a Corpus during instantiation, so that they can build up a vocabulary consisting
      of the most common words seen in the corpus, plus an UNK token for all rare words.
      There are two One Hot Embeddings used in training:
      - the first embeds information about the occurrence of a separator before the token,
      - the second embeds the proposed tags, concatenated with a ';'.
    The model and training logs are saved in the resources_ex_1/taggers/example-pos directory.
    In this method, the internal state of the forward Flair model taken at the end of each token, supplemented by
    information about the occurrence of a separator before the token and the proposed tags for the token, is used
    to train the model for one of the stratified 10-fold cross-validation splits.

    :param data_folder: folder where the files with the column corpus split are stored; these columns are used to
    initialize the ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number indicating which of the stratified 10-fold cross-validation splits (in the range
    1 to 10) is used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train('resources_ex_1/taggers/example-pos/it-' + str(skf_split_no),
                  learning_rate=0.1,
                  mini_batch_size=32,
                  embeddings_storage_mode='gpu',
                  max_epochs=sys.maxsize,
                  monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights('resources_ex_1/taggers/example-pos/it-' +
                         str(skf_split_no) + '/weights.txt')
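An illustrative call of the function above; the folder name is hypothetical and must contain the train_1/test_1 column-format split files.

train_sequence_labeling_model(data_folder='data/pos_splits',
                              proposed_tags_vocabulary_size=150,
                              skf_split_no=1)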
Example #17
# Import and use corpora accordingly:
# chemicals: HUNER_CHEMICALS
# diseases: HUNER_DISEASE
# genes/proteins: HUNER_GENE
# Species: HUNER_SPECIES

# 1. get all corpora for a specific entity type
from flair.models import SequenceTagger
corpus = HUNER_CELL_LINE()

# 2. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. initialize sequence tagger
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type="ner",
                        use_crf=True,
                        locked_dropout=0.5)
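A hedged follow-up sketch: training the tagger above would typically go through flair's ModelTrainer; the output path and hyperparameters below are illustrative.

from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/huner-cell-line',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)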
Example #18
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + '/' + embeddings_name, max_epochs=150)


word_embeddings = [WordEmbeddings('glove')]
run_splits(word_embeddings, 'glove')

word_embeddings = [FlairEmbeddings('news-forward-fast')]
run_splits(word_embeddings, 'news-forward-fast')

word_embeddings = [
    FlairEmbeddings(
        'data/echr_lm_models/news_forward_fast_finetuned_echr/epoch_2.pt')
]
run_splits(word_embeddings, 'news-forward-fast-finetuned')

word_embeddings = [
    FlairEmbeddings('data/echr_lm_models/echr_language_model/epoch_4.pt')
]
run_splits(word_embeddings, 'flair_echr_13k_lm')

word_embeddings = [TransformerWordEmbeddings('bert-base-cased')]
run_splits(word_embeddings, 'bert-base-cased')
Example #19
def get_bio_embeddings():
    return [
        WordEmbeddings("data/BioWordVec_PubMed_MIMICIII_d200.gensim"),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
Example #20
    def __init__(self):
        self.embedder = FlairEmbeddings('news-forward-fast')
        self.embedding_length = self.__len__()
Example #21
def get_general_embeddings():
    return [
        WordEmbeddings("en-crawl"),
        FlairEmbeddings("en-forward"),
        FlairEmbeddings("en-backward")
    ]
Example #22
def test_loading_not_existing_char_lm_embedding():
    with pytest.raises(ValueError):
        FlairEmbeddings('other')
Example #23
import os

if not os.path.exists(IMAGE_PATH):
    os.makedirs(IMAGE_PATH)

sts_dev, sts_test = download_and_load_sts_data()
print('Downloaded data')

frequency = load_frequencies("data/frequencies/frequencies.tsv")
doc_frequency = load_doc_frequencies("data/frequencies/doc_frequencies.tsv")
word2vec = load_word2vec(w2v_path)
elmo = ELMoEmbeddings('large')
bert = TransformerWordEmbeddings('bert-large-cased')
flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])
elmo_bert = StackedEmbeddings([elmo, bert])

print("Loaded Resources")

benchmarks = [("AVG-W2V",
               ft.partial(run_avg_benchmark,
                          model=word2vec,
                          use_stoplist=False)),
              ("AVG-ELMO",
               ft.partial(run_context_avg_benchmark,
                          model=elmo,
                          use_stoplist=False)),
              ("AVG-BERT",
Example #24
    def __init__(self,
                 word_embedding_base: str = None,
                 document_embedding: str = None,
                 fine_tune: bool = False,
                 pretuned: bool = False):
        """

        :param word_embedding_base: - glove: 'glove', (only en), - fasttext: 'en', 'de'
        :param document_embedding:  pool vs rnn for w2v mode - bert: 'bert', 'bert-de'  - 'longformer' (only en) -
        'flair', 'stacked-flair', 'flair-de', 'stacked-flair-de'
        """
        # document embedding
        self.fine_tune = fine_tune
        self.document_embedding = None
        if word_embedding_base:
            self.word_embedding_base = WordEmbeddings(word_embedding_base)

            if document_embedding.lower() == 'pool':
                self.document_embedding = DocumentPoolEmbeddings(
                    [self.word_embedding_base])
            elif document_embedding.lower() == 'rnn':
                self.document_embedding = DocumentRNNEmbeddings(
                    [self.word_embedding_base])
            else:
                raise UserWarning(
                    f'{document_embedding} is not supported in combination with word embeddings'
                )
        elif document_embedding:
            print(document_embedding, pretuned)
            if pretuned:
                if document_embedding.lower() in ('bert', 'bert-de'):
                    self.document_embedding = SentenceTransformer(
                        'stsb-bert-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-bert-large')
                elif document_embedding.lower() == 'roberta':
                    self.document_embedding = SentenceTransformer(
                        'stsb-roberta-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-roberta-large')
                elif document_embedding.lower() == 'xlm':
                    self.document_embedding = SentenceTransformer(
                        'stsb-xlm-r-multilingual')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-xlm-r-multilingual')
            else:
                if document_embedding.lower() == 'bert':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'bert-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-german-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'longformer':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'allenai/longformer-base-4096', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlnet-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlm-mlm-ende-1024', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair':
                    self.document_embedding = FlairEmbeddings(
                        'en-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair-de':
                    self.document_embedding = FlairEmbeddings(
                        'de-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'stack-flair':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('en-forward'),
                        FlairEmbeddings('en-backward'),
                    ])
                elif document_embedding.lower() == 'stack-flair-de':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('de-forward'),
                        FlairEmbeddings('de-backward'),
                    ])
        else:
            raise UserWarning(f'No embeddings defined')
Example #25
# assumed imports for this snippet (flair hyperparameter selection API)
from hyperopt import hp

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import CharacterEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.hyperparameter.param_selection import Parameter, SearchSpace, SequenceTaggerParamSelector
from flair.models import SequenceTagger

# Set up the Corpus
columns = {0: 'text', 1:'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="test.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward') ]),
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), CharacterEmbeddings() ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1,2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
search_space.add(Parameter.USE_RNN, hp.choice, options=[True])


# initialize the parameter selector

param_selector = SequenceTaggerParamSelector(
    corpus,
Example #26
# assumed imports for this snippet (legacy flair embeddings API)
from flair.embeddings import (WordEmbeddings, CharacterEmbeddings, FlairEmbeddings,
                              ELMoEmbeddings, BertEmbeddings, StackedEmbeddings)

runs = 1
use_glove = True
use_cui2vec = False
use_flair = False
use_elmo = False
use_bert = False
mini_batch_size = 32
word_embeddings = []

if use_glove:
    word_embeddings.append(WordEmbeddings('glove'))
    word_embeddings.append(CharacterEmbeddings())
if use_cui2vec:
    word_embeddings.append(WordEmbeddings('./cui2vec_embed_vectors.bin'))
if use_flair:
    word_embeddings.append(FlairEmbeddings('./forward-lm.pt'))
    word_embeddings.append(FlairEmbeddings('./backward-lm.pt'))
if use_elmo:
    word_embeddings.append(ELMoEmbeddings('pubmed'))
if use_bert:
    word_embeddings.append(BertEmbeddings('./bert-base-clinical-cased'))
    mini_batch_size = 8

stacked_word_embeddings = StackedEmbeddings(word_embeddings)

from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            rnn_type='LSTM',
                                            bidirectional=True,
                                            hidden_size=512,
Example #27
from flair.data import Sentence
from flair.embeddings import StackedEmbeddings, FlairEmbeddings
# init Flair embeddings
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')

import numpy as np
import regex as re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
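
# NOTE: the wrapper class below shadows the flair FlairEmbeddings class imported above;
# the flair embeddings themselves live in the module-level variables created earlier.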

class FlairEmbeddings(object):

	def __init__(self):
		self.stop_words = list(stopwords.words('english'))
		# self.stop_words = []
		self.lemmatizer = WordNetLemmatizer()
		self.stacked_embeddings = StackedEmbeddings(
			embeddings=[flair_forward_embedding, flair_backward_embedding])


	def word_token(self, tokens, lemma=False):
		tokens = str(tokens)
		tokens = re.sub(r"([\w].)([\~\!\@\#\$\%\^\&\*\(\)\-\+\[\]\{\}\/\"\'\:\;])([\s\w].)", "\\1 \\2 \\3", tokens)
		tokens = re.sub(r"\s+", " ", tokens)
		if lemma:
			return " ".join([self.lemmatizer.lemmatize(token, 'v') for token in word_tokenize(tokens.lower()) if token not in self.stop_words and token.isalpha()])
		else:
			return " ".join([token for token in word_tokenize(tokens.lower()) if token not in self.stop_words and token.isalpha()])
Example #28
    WordEmbeddings,
    TokenEmbeddings,
    StackedEmbeddings,
    DocumentPoolEmbeddings,
    FlairEmbeddings,
    DocumentRNNEmbeddings,
    DocumentLMEmbeddings,
    TransformerWordEmbeddings,
    TransformerDocumentEmbeddings,
)

from flair.data import Sentence, Dictionary
from flair.models import LanguageModel

glove: TokenEmbeddings = WordEmbeddings("turian")
flair_embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")


def test_load_non_existing_embedding():
    with pytest.raises(ValueError):
        WordEmbeddings("other")

    with pytest.raises(ValueError):
        WordEmbeddings("not/existing/path/to/embeddings")


def test_load_non_existing_flair_embedding():
    with pytest.raises(ValueError):
        FlairEmbeddings("other")

Example #29
if model_choice == 'FastText (woord)' and start_computation == True:
    with st.spinner('berekenen..'):
        fastext_embedding = WordEmbeddings('nl')
        st.write('model geladen, nu nog de embeddings...')
        embedding_list = [fastext_embedding]
        embeddings = createFlairEmbeddings(embedding_list, data)
        st.write('embeddings binnen, nu mooi maken')
        dataframe = createDataFrame(embeddings, data)
        st.write(dataframe)
        csv = dataframe.to_csv(sep=';')
        st.success('Het is gelukt!')

if model_choice == 'Flair (karakter)' and start_computation == True:

    flair_forward = FlairEmbeddings('nl-forward')
    flair_backward = FlairEmbeddings('nl-backward')
    embedding_list = [flair_forward, flair_backward]
    embeddings = createFlairEmbeddings(embedding_list, data)
    dataframe = createDataFrame(embeddings, data)
    st.write(dataframe)
    csv = dataframe.to_csv(sep=';')

if model_choice == 'RobBERT (zin (RoBERTa))' and start_computation == True:
    modelName = 'pdelobelle/robBERT-base'

    embeddings = createTransformerEmbeddings(modelName, data)

    st.write(embeddings[0])
    print(embeddings[0][0][0])
    list_embeddings = []
Example #30
    "resources/data/",
    column_format,
    train_file="train_biofid.conll",
    in_memory=True,
    tag_to_bioes=list(column_format.values())[3:],
    comment_symbol="#",
    tags_to_keep=['Animal_Fauna', 'Plant_Flora', 'Taxon'])
tag_dictionaries = corpus.make_tag_dictionary()
for tag, tag_dictionary in tag_dictionaries.items():
    print(tag, str(tag_dictionary))

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de-wiki'),
    BytePairEmbeddings('de', 100, 5000),
    CharacterEmbeddings(),
    FlairEmbeddings('de-forward'),
    FlairEmbeddings('de-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

taggers: Dict[str, SequenceTagger] = {}
tag_dictionary_items = list(tag_dictionaries.items())
for tag, tag_dictionary in tag_dictionary_items:
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag,
                                            use_crf=True)
    taggers.update({tag: tagger})

    # share parameters