Exemplo n.º 1
0
    def load(model: str):
        """Load a pre-trained sequence tagger identified by a short name.

        :param model: case-insensitive identifier; 'ner' and 'chunk' are recognised
        :return: the deserialized tagger, switched to eval mode and moved to the GPU
            when CUDA is available; None when the identifier is not recognised
        """
        # short name -> location of the serialized model
        known_models = {
            'ner': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/ner-conll03.pt',
            'chunk': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/chunk-conll2000.pt',
        }

        url = known_models.get(model.lower())
        if url is None:
            return None

        model_file = cached_path(url)
        if model_file is None:
            return None

        # tensors saved on 'cuda:0' are remapped to the CPU while deserializing
        tagger: SequenceTaggerLSTM = torch.load(model_file, map_location={'cuda:0': 'cpu'})
        tagger.eval()
        return tagger.cuda() if torch.cuda.is_available() else tagger
Exemplo n.º 2
0
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
    ):
        """Fetch the three WNUT-17 files via ``cached_path`` and initialise the parent corpus.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param tag_to_bioes: passed through to the parent constructor
        :param in_memory: passed through to the parent constructor
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        wnut_path = "https://noisy-text.github.io/2017/files/"
        download_dir = Path("datasets") / dataset_name
        for file_name in ("wnut17train.conll",
                          "emerging.dev.conll",
                          "emerging.test.annotated"):
            cached_path(f"{wnut_path}{file_name}", download_dir)

        super(WNUT_17, self).__init__(data_folder,
                                      columns,
                                      tag_to_bioes=tag_to_bioes,
                                      in_memory=in_memory)
Exemplo n.º 3
0
    def __init__(self,
                 base_path: Union[str, Path] = None,
                 in_memory: bool = True):
        """Fetch the UD_Bulgarian-BTB dev/test/train files and initialise the parent corpus.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param in_memory: passed through to the parent constructor
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master"
        download_dir = Path("datasets") / dataset_name
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/bg_btb-ud-{split}.conllu", download_dir)

        super(UD_BULGARIAN, self).__init__(data_folder, in_memory=in_memory)
Exemplo n.º 4
0
    def __init__(self,
                 base_path: Union[str, Path] = None,
                 in_memory: bool = True):
        """Fetch the UD_Classical_Chinese-Kyoto dev/test/train files and initialise the parent corpus.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param in_memory: passed through to the parent constructor
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master"
        download_dir = Path("datasets") / dataset_name
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lzh_kyoto-ud-{split}.conllu", download_dir)

        super(UD_CHINESE_KYOTO, self).__init__(data_folder,
                                               in_memory=in_memory)
Exemplo n.º 5
0
    def __init__(
            self,
            base_path: Union[str, Path] = None,
            tag_to_bioes: str = "ner",
            in_memory: bool = True,
    ):
        """Download and clean the DDT archive if needed, then initialise the parent corpus.

        On first use the zip archive is fetched, extracted and the three
        ``ddt.<part>.conllu`` files are rewritten without the ``name=`` prefix and the
        ``|SpaceAfter=No`` marker.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param tag_to_bioes: passed through to the parent constructor
        :param in_memory: passed through to the parent constructor
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {1: 'text', 3: 'pos', 9: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): extraction always happens below flair.cache_root, while the parent
        # class reads from data_folder - the two differ when a custom base_path is given.
        data_path = Path(flair.cache_root) / "datasets" / dataset_name
        train_data_file = data_path / "ddt.train.conllu"
        if not train_data_file.is_file():
            temp_file = cached_path(
                'https://danlp.s3.eu-central-1.amazonaws.com/datasets/ddt.zip',
                Path("datasets") / dataset_name
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, 'r') as zip_file:
                zip_file.extractall(path=data_path)

            # Remove CoNLL-U meta information in the last column
            for part in ['train', 'dev', 'test']:
                data_file = data_path / "ddt.{}.conllu".format(part)

                # Comment and blank lines are kept verbatim exactly once (they used to be
                # written twice); every other line loses the meta markers. The encoding is
                # explicit so the rewrite does not depend on the platform default.
                with open(data_file, 'r', encoding='utf-8') as file:
                    lines = [
                        line if line.startswith("#") or line == "\n"
                        else line.replace("name=", "").replace("|SpaceAfter=No", "")
                        for line in file
                    ]

                with open(data_file, 'w', encoding='utf-8') as file:
                    file.writelines(lines)

                print(data_file)

        super(DANE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes,
            in_memory=in_memory, comment_symbol="#"
        )
Exemplo n.º 6
0
    def __init__(
            self,
            base_path: Union[str, Path] = None,
            in_memory: bool = True,
            sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
    ):
        """Download and convert the DrugProt archive if needed, then initialise the parent corpus.

        The dataset folder name combines the lowercased class name with the splitter's class
        name, so each splitter type gets its own converted copy.

        :param base_path: parent folder of the dataset folder; when empty,
            ``flair.cache_root / "datasets"`` is used
        :param in_memory: passed through to the parent constructor
        :param sentence_splitter: stored on the instance as ``self.sentence_splitter``.
            NOTE(review): the default is a single instance created when the function is
            defined and is therefore shared by all callers relying on it.
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        self.sentence_splitter = sentence_splitter

        # this dataset name
        dataset_name = self.__class__.__name__.lower() + "_" + type(
            self.sentence_splitter).__name__

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        drugprot_url = (
            "https://zenodo.org/record/5042151/files/drugprot-gs-training-development.zip"
        )
        data_file = data_folder / "drugprot-train.conllu"

        # the converted train file doubles as the "already prepared" marker
        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            cached_path(drugprot_url, source_data_folder)
            self.extract_and_convert_to_conllu(
                data_file=source_data_folder /
                "drugprot-gs-training-development.zip",
                data_folder=data_folder,
            )

        super(DrugProt, self).__init__(
            data_folder,
            in_memory=in_memory,
            sample_missing_splits=False,
        )
Exemplo n.º 7
0
    def load(model: str):
        """Load a pre-trained sequence tagger identified by a short name.

        :param model: case-insensitive identifier; 'ner', 'chunk' and 'pos' are recognised
        :return: the deserialized tagger, switched to eval mode and moved to the GPU
            when CUDA is available; None when the identifier is not recognised
        """
        # short name -> location of the serialized model
        known_models = {
            'ner': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/ner-conll03.pt',
            'chunk': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/chunk-conll2000.pt',
            'pos': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/pos-ontonotes-small.pt',
        }

        url = known_models.get(model.lower())
        if url is None:
            return None

        model_file = cached_path(url)
        if model_file is None:
            return None

        # tensors saved on 'cuda:0' are remapped to the CPU while deserializing
        tagger: SequenceTaggerLSTM = torch.load(
            model_file, map_location={'cuda:0': 'cpu'})
        tagger.eval()
        return tagger.cuda() if torch.cuda.is_available() else tagger
Exemplo n.º 8
0
    def load(model: str):
        """Load a pre-trained text classifier identified by a short name.

        :param model: case-insensitive identifier; 'de-offensive-language' and
            'en-sentiment' are recognised
        :return: the result of ``TextClassifier.load_from_file`` for the cached model file,
            or None when the identifier is not recognised
        """
        aws_resource_path = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        )
        cache_dir = Path("models")

        # identifier -> (remote folder, file name)
        remote_files = {
            "de-offensive-language": (
                "TEXT-CLASSIFICATION_germ-eval-2018_task-1",
                "germ-eval-2018-task-1.pt",
            ),
            "en-sentiment": ("TEXT-CLASSIFICATION_imdb", "imdb.pt"),
        }

        key = model.lower()
        if key not in remote_files:
            return None

        folder, file_name = remote_files[key]
        model_file = cached_path(f"{aws_resource_path}/{folder}/{file_name}",
                                 cache_dir=cache_dir)
        if model_file is None:
            return None
        return TextClassifier.load_from_file(model_file)
Exemplo n.º 9
0
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        """
        Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: passed through to the parent constructor ("ner" by default)
        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
        """

        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
        cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)

        # the single downloaded file is handed over as the train file
        super(LER_GERMAN, self).__init__(data_folder,
                                         columns,
                                         tag_to_bioes=tag_to_bioes,
                                         in_memory=in_memory,
                                         train_file='ler.conll')
Exemplo n.º 10
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: an alias ("relations-fast", "relations") or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        # alias -> remote location of the serialized model
        model_map = {
            "relations-fast": f"{hu_path}/relations-fast/relations-fast.pt",
            "relations": f"{hu_path}/relations/relations.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 11
0
 def _fetch_model(model_name) -> str:
     """Translate a known model alias into its cached file; pass anything else through.

     :param model_name: an alias ('de-offensive-language', 'en-sentiment') or any other value
     :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
         unchanged
     """
     aws_resource_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4'

     # alias -> remote location of the serialized model
     model_map = {
         'de-offensive-language':
             f'{aws_resource_path}/TEXT-CLASSIFICATION_germ-eval-2018_task-1/germ-eval-2018-task-1.pt',
         'en-sentiment': f'{aws_resource_path}/TEXT-CLASSIFICATION_imdb/imdb.pt',
     }

     cache_dir = Path('models')
     if model_name not in model_map:
         return model_name
     return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 12
0
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """Fetch the UD_North_Sami-Giella test/train files and initialise the parent corpus.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param in_memory: passed through to the parent constructor
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary (only a test and a train file are fetched here)
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master"
        download_dir = Path("datasets") / dataset_name
        for split in ("test", "train"):
            cached_path(f"{web_path}/sme_giella-ud-{split}.conllu", download_dir)

        super(UD_NORTH_SAMI, self).__init__(data_folder, in_memory=in_memory)
Exemplo n.º 13
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: the alias "tars-base" or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        # alias -> remote location of the serialized model
        model_map = {
            "tars-base": f"{hu_path}/tars-base/tars-base-v8.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
    def __init__(
        self,
        embeddings: str,
        use_local: bool = True,
        use_gensim: bool = False,
        field: str = None,
    ):
        """
        Set up fasttext word embeddings from a local file or from a remote source that is
        fetched through ``cached_path``.

        :param embeddings: path to your embeddings '.bin' file
        :param use_local: set this to False if you are using embeddings from a remote source
        :param use_gensim: set this to true if your fasttext embedding is trained with fasttext version below 0.9.1
        :param field: stored unchanged on the instance as ``self.field``
        :raises ValueError: if ``use_local`` is set and ``embeddings`` is not an existing path
        """

        cache_dir = Path("embeddings")

        if not use_local:
            # remote source: resolve it to a cached copy
            embeddings = cached_path(f"{embeddings}", cache_dir=cache_dir)
        elif not Path(embeddings).exists():
            raise ValueError(
                f'The given embeddings "{embeddings}" is not available or is not a valid path.'
            )

        self.embeddings = embeddings
        self.name: str = str(embeddings)
        self.static_embeddings = True
        self.use_gensim = use_gensim

        # both loaders take the location as a plain string
        location = str(embeddings)
        if use_gensim:
            self.precomputed_word_embeddings = gensim.models.FastText.load_fasttext_format(location)
            dimension = self.precomputed_word_embeddings.vector_size
        else:
            self.precomputed_word_embeddings = ft.load_model(location)
            dimension = self.precomputed_word_embeddings.get_dimension()
        self.__embedding_length: int = dimension

        self.field = field
        super().__init__()
Exemplo n.º 15
0
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
    ):
        """
        Initialize the Dutch CoNLL NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        NOTE(review): the class is named CONLL_03_DUTCH, but the files are fetched from the
        conll2002 location - confirm the intended naming.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always "datasets/<name>", even when a custom
        # base_path is given - confirm how cached_path resolves this relative folder.
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
        download_dir = Path("datasets") / dataset_name
        for file_name in ("ned.testa", "ned.testb", "ned.train"):
            cached_path(f"{conll_02_path}{file_name}", download_dir)

        # a separator token is only handed over when whole documents are requested
        separator = "-DOCSTART-" if document_as_sequence else None

        super(CONLL_03_DUTCH, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            document_separator_token=separator,
        )
Exemplo n.º 16
0
    def __init__(self,
                 base_path: Union[str, Path] = None,
                 in_memory: bool = False):
        """Fetch the UD_German-HDT files, merge the four train parts and initialise the parent corpus.

        The train parts are downloaded into an "original" sub-folder and concatenated once
        into ``de_hdt-ud-train-all.conllu``.

        :param base_path: parent folder of the dataset folder; when empty,
            ``<flair.cache_root>/datasets`` is used
        :param in_memory: passed through to the parent constructor
        """
        # function-scope import: used only for the streamed concatenation below
        import shutil

        # isinstance instead of an exact type comparison, so str subclasses are converted too
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = (
            "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev"
        )
        download_dir = Path("datasets") / dataset_name
        for split in ("dev", "test"):
            cached_path(f"{ud_path}/de_hdt-ud-{split}.conllu", download_dir)

        train_filenames = [
            "de_hdt-ud-train-a-1.conllu",
            "de_hdt-ud-train-a-2.conllu",
            "de_hdt-ud-train-b-1.conllu",
            "de_hdt-ud-train-b-2.conllu",
        ]

        for train_file in train_filenames:
            cached_path(f"{ud_path}/{train_file}", download_dir / "original")

        # NOTE(review): the merged file is always written below flair.cache_root, while the
        # parent class reads from data_folder - the two differ for a custom base_path.
        data_path = Path(flair.cache_root) / "datasets" / dataset_name

        new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu"

        if not new_train_file.is_file():
            # Byte-wise, streamed concatenation: independent of the platform default text
            # encoding and without loading a whole part into memory.
            with open(new_train_file, "wb") as f_out:
                for train_filename in train_filenames:
                    with open(data_path / "original" / train_filename,
                              "rb") as f_in:
                        shutil.copyfileobj(f_in, f_out)

        super(UD_GERMAN_HDT, self).__init__(data_folder, in_memory=in_memory)
Exemplo n.º 17
0
def evaluate(test_file, model_file, dataset_format='macss', semeval_scoring=False):
    """Evaluate a stored text classifier on a test file and print the result.

    The test file is loaded twice through the loader selected by ``dataset_format``: once
    with ``is_test=False`` (used as reference labels) and once with ``is_test=True`` (the
    sentences handed to the classifier).

    :param test_file: path of the test data; ``vocabulary/embeddings.csv`` is looked up
        in the same directory
    :param model_file: path handed to ``TextClassifier.load_from_file``
    :param dataset_format: key into ``dataset_loader`` selecting the loading function
    :param semeval_scoring: if True, the SemEval-2010 task 8 scorer script is fetched and run
        on the (id, label) pairs and its stdout is printed; otherwise a
        ``classification_report`` is printed
    """
    if semeval_scoring:
        eval_script = cached_path(
            'https://raw.githubusercontent.com/vzhong/semeval/master/dataset/SemEval2010_task8_scorer-v1.2/semeval2010_task8_scorer-v1.2.pl',
            cache_dir='scripts')
        # NOTE(review): 0o777 makes the downloaded script world-writable; 0o755 would
        # presumably be enough to execute it - confirm before tightening.
        chmod(eval_script, 0o777)

    classifier: TextClassifier = TextClassifier.load_from_file(model_file)
    idx2item = load_idx2item(join(dirname(test_file), 'vocabulary/embeddings.csv'))

    load_dataset = dataset_loader[dataset_format]

    sentences_test: List[Sentence] = load_dataset(test_file, idx2item, is_test=False, attach_id=True)
    sentences_pred: List[Sentence] = load_dataset(test_file, idx2item, is_test=True, attach_id=True)

    sentences_pred = classifier.predict(sentences_pred)

    if not semeval_scoring:
        y_true = [sentence.labels[0].name for sentence in sentences_test]
        y_pred = [sentence.labels[0].name for sentence in sentences_pred]
        print(classification_report(y_true, y_pred))
        return

    # function-scope import: only needed to remove the scorer input files
    import os

    id_labels_true = [(sentence.id_, sentence.labels[0]) for sentence in sentences_test]
    id_labels_pred = [(sentence.id_, sentence.labels[0]) for sentence in sentences_pred]

    # The scorer reads its inputs by file name, so the files must outlive their handles:
    # they are created with delete=False, closed after writing and removed explicitly once
    # the scorer has finished (or failed).
    scorer_inputs = []
    try:
        for id_labels in (id_labels_true, id_labels_pred):
            with NamedTemporaryFile(mode='w', delete=False) as tmp_file:
                scorer_inputs.append(tmp_file.name)
                tmp_file.writelines(
                    '{}\t{}\n'.format(id_, label.name) for id_, label in id_labels)

        p = run([eval_script, scorer_inputs[0], scorer_inputs[1]], stdout=PIPE, encoding='utf-8')
    finally:
        for input_path in scorer_inputs:
            os.remove(input_path)

    main_result = p.stdout
    print(main_result)
Exemplo n.º 18
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: an alias ("de-offensive-language", "en-sentiment") or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        aws_resource_path = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        )

        # alias -> remote location of the serialized model
        model_map = {
            "de-offensive-language":
                f"{aws_resource_path}/classy-offensive-de-rnn-cuda%3A0/germ-eval-2018-task-1-v0.4.pt",
            "en-sentiment":
                f"{aws_resource_path}/classy-imdb-en-rnn-cuda%3A0/imdb-v0.4.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 19
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: an alias ("de-offensive-language", "en-sentiment") or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        aws_resource_path = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        )

        # alias -> remote location of the serialized model
        model_map = {
            "de-offensive-language":
                f"{aws_resource_path}/TEXT-CLASSIFICATION_germ-eval-2018_task-1/germ-eval-2018-task-1.pt",
            "en-sentiment":
                f"{aws_resource_path}/TEXT-CLASSIFICATION_imdb/imdb.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 20
0
    def __init__(self, **kwargs):
        """Cache the Feidegger description file and its images, then build train/dev/test subsets.

        Every image referenced in the JSON file is downloaded once into an "images" folder
        next to the cached JSON file, and the entry's "url" is replaced by the local file
        name. Entries whose ``split`` value is 0-7 form the train subset, 8 the dev subset
        and 9 the test subset.

        :param kwargs: passed through to ``FeideggerDataset``
        """
        dataset = "feidegger"

        # cache Feidegger config file
        json_link = "https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json"
        json_local_path = cached_path(json_link, Path("datasets") / dataset)

        # read the description inside a context manager so the handle is closed right away
        # (JSON interchange text is UTF-8, hence the explicit encoding)
        with open(json_local_path, "r", encoding="utf-8") as json_file:
            dataset_info = json.load(json_file)

        # cache Feidegger images
        images_cache_folder = os.path.join(os.path.dirname(json_local_path),
                                           "images")
        os.makedirs(images_cache_folder, exist_ok=True)
        for image_info in tqdm(dataset_info):
            name = os.path.basename(image_info["url"])
            filename = os.path.join(images_cache_folder, name)
            if not os.path.isfile(filename):
                urllib.request.urlretrieve(image_info["url"], filename)
            # replace image URL with local cached file
            image_info["url"] = filename

        feidegger_dataset: Dataset = FeideggerDataset(dataset_info, **kwargs)

        train_indices = list(
            np.where(np.in1d(feidegger_dataset.split, list(range(8))))[0])
        train = torch.utils.data.dataset.Subset(feidegger_dataset,
                                                train_indices)

        dev_indices = list(np.where(np.in1d(feidegger_dataset.split, [8]))[0])
        dev = torch.utils.data.dataset.Subset(feidegger_dataset, dev_indices)

        test_indices = list(np.where(np.in1d(feidegger_dataset.split, [9]))[0])
        test = torch.utils.data.dataset.Subset(feidegger_dataset, test_indices)

        super(FeideggerCorpus, self).__init__(train,
                                              dev,
                                              test,
                                              name="feidegger")
Exemplo n.º 21
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: one of the aliases listed in the table below, or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        # "sentiment" and "en-sentiment" are two aliases for the same file
        english_sentiment = f"{hu_path}/sentiment-curated-distilbert/sentiment-en-mix-distillbert.pt"

        # alias -> remote location of the serialized model
        model_map = {
            "de-offensive-language":
                f"{aws_resource_path}/classy-offensive-de-rnn-cuda%3A0/germ-eval-2018-task-1-v0.4.pt",
            # English sentiment models
            "sentiment": english_sentiment,
            "en-sentiment": english_sentiment,
            "sentiment-fast":
                f"{hu_path}/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn.pt",
            # Communicative Functions Model
            "communicative-functions":
                f"{hu_path}/comfunc/communicative-functions-v0.5b.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 22
0
    def _fetch_model(model_name) -> str:
        """Translate a known model alias into its cached file; pass anything else through.

        :param model_name: one of the aliases listed in the table below, or any other value
        :return: the result of ``cached_path`` for a known alias, otherwise ``model_name``
            unchanged
        """
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        # "sentiment" and "en-sentiment" are two aliases for the same file
        english_sentiment = f"{hu_path}/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt"

        # alias -> remote location of the serialized model
        model_map = {
            "de-offensive-language":
                f"{hu_path}/de-offensive-language/germ-eval-2018-task-1-v0.8.pt",
            # English sentiment models
            "sentiment": english_sentiment,
            "en-sentiment": english_sentiment,
            "sentiment-fast":
                f"{hu_path}/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn_v8.pt",
            # Communicative Functions Model
            "communicative-functions":
                f"{hu_path}/comfunc/communicative-functions.pt",
        }

        cache_dir = Path("models")
        if model_name not in model_map:
            return model_name
        return cached_path(model_map[model_name], cache_dir=cache_dir)
Exemplo n.º 23
0
    def download_dataset(task: NLPTask):
        """Download and unpack the data of a task if its ``train.txt`` is not present yet.

        Only ``NLPTask.CONLL_2000`` and ``NLPTask.IMDB`` are handled; any other task is a
        no-op. Files end up below ``<CACHE_ROOT>/datasets/<task.value>``.

        :param task: the task whose dataset should be made available
        """

        # conll 2000 chunking task
        if task == NLPTask.CONLL_2000:
            conll_2000_path = 'https://www.clips.uantwerpen.be/conll2000/chunking/'
            data_path = Path(flair.file_utils.CACHE_ROOT) / 'datasets' / task.value
            if not (data_path / 'train.txt').is_file():
                import gzip
                import shutil

                cached_path(f'{conll_2000_path}train.txt.gz', Path('datasets') / task.value)
                cached_path(f'{conll_2000_path}test.txt.gz', Path('datasets') / task.value)

                # train.txt is the "already prepared" marker checked above, so it is
                # unpacked last: a failure while unpacking test.txt triggers a retry.
                for split in ('test', 'train'):
                    with gzip.open(data_path / f'{split}.txt.gz', 'rb') as f_in:
                        with open(data_path / f'{split}.txt', 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)

        if task == NLPTask.IMDB:
            imdb_acl_path = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
            data_path = Path(flair.file_utils.CACHE_ROOT) / 'datasets' / task.value
            data_file = data_path / 'train.txt'
            if not data_file.is_file():
                cached_path(imdb_acl_path, Path('datasets') / task.value)
                import tarfile
                with tarfile.open(data_path / 'aclImdb_v1.tar.gz', 'r:gz') as f_in:
                    datasets = ['train', 'test']
                    labels = ['pos', 'neg']

                    # the member list is read once instead of once per label/dataset pair
                    members = f_in.getmembers()

                    for label in labels:
                        for dataset in datasets:
                            # NOTE(review): extractall trusts the member paths of a
                            # downloaded archive; consider validating them.
                            f_in.extractall(data_path, members=[m for m in members
                                                                if f'{dataset}/{label}' in m.name])
                            # The reviews are read as utf-8, so the combined file is
                            # written as utf-8 too instead of the platform default.
                            with open(data_path / f'{dataset}.txt', 'at', encoding='utf-8') as f_p:
                                current_path = data_path / 'aclImdb' / dataset / label
                                for file_name in current_path.iterdir():
                                    if file_name.is_file() and file_name.name.endswith('.txt'):
                                        # read_text closes each review file right after reading
                                        f_p.write(f'__label__{label} '
                                                  + file_name.read_text(encoding='utf-8') + '\n')
Exemplo n.º 24
0
    def __init__(self,
                 base_path: Union[str, Path] = None,
                 in_memory: bool = True):
        """Download and convert the CoNLL04 relation data if needed, then initialise the parent corpus.

        :param base_path: parent folder of the dataset folder; when empty,
            ``flair.cache_root / "datasets"`` is used
        :param in_memory: passed through to the parent constructor
        """
        if not base_path:
            base_path = flair.cache_root / "datasets"
        else:
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        data_folder = base_path / dataset_name

        # TODO: change data source to original CoNLL04 -- this dataset has span formatting errors
        # download data if necessary
        conll04_url = (
            "https://raw.githubusercontent.com/bekou/multihead_joint_entity_relation_extraction/master/data/CoNLL04/"
        )
        data_file = data_folder / "conll04-train.conllu"

        # The converted train file is the "already prepared" marker. The former
        # `True or ...` condition bypassed it and redid the work on every construction.
        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            for file_name in ("train.txt", "dev.txt", "test.txt"):
                cached_path(f"{conll04_url}{file_name}", source_data_folder)

            self.convert_to_conllu(
                source_data_folder=source_data_folder,
                data_folder=data_folder,
            )

        super(RE_ENGLISH_CONLL04, self).__init__(
            data_folder,
            in_memory=in_memory,
            column_format={
                1: "text",
                2: "ner"
            },
            comment_symbol="# ",
        )
Exemplo n.º 25
0
    def __init__(
        self,
        **corpusargs,
    ):
        """Download the fine-grained SST files if needed, convert them and initialise the parent corpus.

        Each raw line is rewritten as ``__label__<first character> <rest from index 2>``
        into ``train.txt`` / ``dev.txt`` / ``test.txt``.

        :param corpusargs: passed through to the parent constructor
        """
        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        data_folder = Path(flair.cache_root) / "datasets" / dataset_name

        # download data if necessary
        if not (data_folder / "train.txt").is_file():

            # download the raw files if necessary
            raw_url = 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/'
            for split in ['train', 'test', 'dev']:
                cached_path(f'{raw_url}stsa.fine.{split}',
                            Path("datasets") / dataset_name / 'raw')

            # convert to FastText format
            for split in ['train', 'dev', 'test']:
                # The raw files are decoded as latin1; the converted files are written as
                # utf-8 explicitly so the result does not depend on the platform default.
                with open(data_folder / f"{split}.txt", "w", encoding="utf-8") as converted_file:

                    with open(data_folder / 'raw' / f'stsa.fine.{split}',
                              encoding="latin1") as raw_file:
                        for line in raw_file:
                            converted_file.write(f"__label__{line[0]} {line[2:]}")

        super(SENTEVAL_SST_GRANULAR, self).__init__(
            data_folder,
            tokenizer=segtok_tokenizer,
            **corpusargs,
        )
Exemplo n.º 26
0
    def __init__(self,
                 base_path: Union[str, Path] = None,
                 in_memory: bool = True):
        """Fetch the three UD Maltese MUDT ``.conllu`` files and initialise the parent.

        :param base_path: folder that contains (or will contain) the dataset
            sub-folder; a string is converted to ``Path``. When empty or
            ``None``, ``flair.cache_root / "datasets"`` is used.
        :param in_memory: passed through to the parent constructor.
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always the relative folder
        # "datasets/<name>", independent of base_path; presumably cached_path
        # resolves it against flair.cache_root, in which case a custom
        # base_path would not receive the files - confirm.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/mt_mudt-ud-{split}.conllu",
                        Path("datasets") / dataset_name)

        super(UD_MALTESE, self).__init__(data_folder, in_memory=in_memory)
Exemplo n.º 27
0
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
    ):
        """Fetch the Finnish NER (digitoday) files and initialise the parent corpus.

        The train, dev and test CSV files are requested through ``cached_path``,
        the test file is passed to ``_remove_lines_without_annotations``, and
        the parent is initialised with a two-column (text, ner) format and
        ``skip_first_line=True``.

        :param base_path: folder that contains (or will contain) the dataset
            sub-folder; a string is converted to ``Path``. When empty or
            ``None``, ``flair.cache_root / "datasets"`` is used.
        :param tag_to_bioes: passed through to the parent constructor.
        :param in_memory: passed through to the parent constructor.
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target is always the relative folder
        # "datasets/<name>", independent of base_path; presumably cached_path
        # resolves it against flair.cache_root - confirm.
        ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday."
        for file_suffix in ("2014.train.csv", "2014.dev.csv", "2015.test.csv"):
            cached_path(f"{ner_finnish_path}{file_suffix}",
                        Path("datasets") / dataset_name)

        # data_folder is already a Path, so no extra Path() wrapper is needed
        _remove_lines_without_annotations(
            data_file=data_folder / "digitoday.2015.test.csv")

        super(NER_FINNISH, self).__init__(data_folder,
                                          columns,
                                          tag_to_bioes=tag_to_bioes,
                                          in_memory=in_memory,
                                          skip_first_line=True)
Exemplo n.º 28
0
    def _fetch_model(model_name) -> str:

        model_map = {}

        aws_resource_path_v04 = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        )

        model_map["ner"] = "/".join(
            [aws_resource_path_v04, "NER-conll03-english", "en-ner-conll03-v0.4.pt"]
        )

        model_map["ner-fast"] = "/".join(
            [
                aws_resource_path_v04,
                "NER-conll03--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward-fast%2Bnews-backward-fast-normal-locked0.5-word0.05--release_4",
                "en-ner-fast-conll03-v0.4.pt",
            ]
        )

        model_map["ner-ontonotes"] = "/".join(
            [
                aws_resource_path_v04,
                "release-ner-ontonotes-0",
                "en-ner-ontonotes-v0.4.pt",
            ]
        )

        model_map["ner-ontonotes-fast"] = "/".join(
            [
                aws_resource_path_v04,
                "release-ner-ontonotes-fast-0",
                "en-ner-ontonotes-fast-v0.4.pt",
            ]
        )

        for key in ["ner-multi", "multi-ner"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "release-quadner-512-l2-multi-embed",
                    "quadner-large.pt",
                ]
            )

        for key in ["ner-multi-fast", "multi-ner-fast"]:
            model_map[key] = "/".join(
                [aws_resource_path_v04, "NER-multi-fast", "ner-multi-fast.pt"]
            )

        for key in ["ner-multi-fast-learn", "multi-ner-fast-learn"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "NER-multi-fast-evolve",
                    "ner-multi-fast-learn.pt",
                ]
            )

        model_map["pos"] = "/".join(
            [
                aws_resource_path_v04,
                "POS-ontonotes--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
                "en-pos-ontonotes-v0.4.pt",
            ]
        )

        model_map["pos-fast"] = "/".join(
            [
                aws_resource_path_v04,
                "release-pos-fast-0",
                "en-pos-ontonotes-fast-v0.4.pt",
            ]
        )

        for key in ["pos-multi", "multi-pos"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "release-dodekapos-512-l2-multi",
                    "pos-multi-v0.1.pt",
                ]
            )

        for key in ["pos-multi-fast", "multi-pos-fast"]:
            model_map[key] = "/".join(
                [aws_resource_path_v04, "UPOS-multi-fast", "pos-multi-fast.pt"]
            )

        model_map["frame"] = "/".join(
            [aws_resource_path_v04, "release-frame-1", "en-frame-ontonotes-v0.4.pt"]
        )

        model_map["frame-fast"] = "/".join(
            [
                aws_resource_path_v04,
                "release-frame-fast-0",
                "en-frame-ontonotes-fast-v0.4.pt",
            ]
        )

        model_map["chunk"] = "/".join(
            [
                aws_resource_path_v04,
                "NP-conll2000--h256-l1-b32-p3-0.5-%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
                "en-chunk-conll2000-v0.4.pt",
            ]
        )

        model_map["chunk-fast"] = "/".join(
            [
                aws_resource_path_v04,
                "release-chunk-fast-0",
                "en-chunk-conll2000-fast-v0.4.pt",
            ]
        )

        model_map["da-pos"] = "/".join(
            [aws_resource_path_v04, "POS-danish", "da-pos-v0.1.pt"]
        )

        model_map["da-ner"] = "/".join(
            [aws_resource_path_v04, "NER-danish", "da-ner-v0.1.pt"]
        )

        model_map["de-pos"] = "/".join(
            [aws_resource_path_v04, "release-de-pos-0", "de-pos-ud-hdt-v0.4.pt"]
        )

        model_map["de-pos-fine-grained"] = "/".join(
            [
                aws_resource_path_v04,
                "POS-fine-grained-german-tweets",
                "de-pos-twitter-v0.1.pt",
            ]
        )

        model_map["de-ner"] = "/".join(
            [aws_resource_path_v04, "release-de-ner-0", "de-ner-conll03-v0.4.pt"]
        )

        model_map["de-ner-germeval"] = "/".join(
            [aws_resource_path_v04, "NER-germeval", "de-ner-germeval-0.4.1.pt"]
        )

        model_map["fr-ner"] = "/".join(
            [aws_resource_path_v04, "release-fr-ner-0", "fr-ner-wikiner-0.4.pt"]
        )
        model_map["nl-ner"] = "/".join(
            [aws_resource_path_v04, "NER-conll2002-dutch", "nl-ner-conll02-v0.1.pt"]
        )
        model_map["ml-pos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt"
        model_map["ml-xpos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt"

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        return model_name
Exemplo n.º 29
0
    def _fetch_model(model_name) -> str:

        model_map = {}
        aws_resource_path = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2"
        )
        aws_resource_path_v04 = (
            "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        )

        model_map["ner"] = "/".join(
            [aws_resource_path_v04, "NER-conll03-english", "en-ner-conll03-v0.4.pt"]
        )

        model_map["ner-fast"] = "/".join(
            [
                aws_resource_path,
                "NER-conll03--h256-l1-b32-experimental--fast-v0.2",
                "en-ner-fast-conll03-v0.2.pt",
            ]
        )

        model_map["ner-ontonotes"] = "/".join(
            [
                aws_resource_path,
                "NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward%2Bnews-backward--v0.2",
                "en-ner-ontonotes-v0.3.pt",
            ]
        )

        model_map["ner-ontonotes-fast"] = "/".join(
            [
                aws_resource_path,
                "NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
                "en-ner-ontonotes-fast-v0.3.pt",
            ]
        )

        for key in ["ner-multi", "multi-ner"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "release-quadner-512-l2-multi-embed",
                    "quadner-large.pt",
                ]
            )

        for key in ["ner-multi-fast", "multi-ner-fast"]:
            model_map[key] = "/".join(
                [aws_resource_path_v04, "NER-multi-fast", "ner-multi-fast.pt"]
            )

        for key in ["ner-multi-fast-learn", "multi-ner-fast-learn"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "NER-multi-fast-evolve",
                    "ner-multi-fast-learn.pt",
                ]
            )

        model_map["pos"] = "/".join(
            [
                aws_resource_path,
                "POS-ontonotes--h256-l1-b32-%2Bmix-forward%2Bmix-backward--v0.2",
                "en-pos-ontonotes-v0.2.pt",
            ]
        )

        model_map["pos-fast"] = "/".join(
            [
                aws_resource_path,
                "POS-ontonotes--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
                "en-pos-ontonotes-fast-v0.2.pt",
            ]
        )

        for key in ["pos-multi", "multi-pos"]:
            model_map[key] = "/".join(
                [
                    aws_resource_path_v04,
                    "release-dodekapos-512-l2-multi",
                    "pos-multi-v0.1.pt",
                ]
            )

        for key in ["pos-multi-fast", "multi-pos-fast"]:
            model_map[key] = "/".join(
                [aws_resource_path_v04, "UPOS-multi-fast", "pos-multi-fast.pt"]
            )

        model_map["frame"] = "/".join(
            [
                aws_resource_path,
                "FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward%2Bnews-backward--v0.2",
                "en-frame-ontonotes-v0.2.pt",
            ]
        )

        model_map["frame-fast"] = "/".join(
            [
                aws_resource_path,
                "FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
                "en-frame-ontonotes-fast-v0.2.pt",
            ]
        )

        model_map["chunk"] = "/".join(
            [
                aws_resource_path,
                "NP-conll2000--h256-l1-b32-%2Bnews-forward%2Bnews-backward--v0.2",
                "en-chunk-conll2000-v0.2.pt",
            ]
        )

        model_map["chunk-fast"] = "/".join(
            [
                aws_resource_path,
                "NP-conll2000--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
                "en-chunk-conll2000-fast-v0.2.pt",
            ]
        )

        model_map["de-pos"] = "/".join(
            [
                aws_resource_path,
                "UPOS-udgerman--h256-l1-b8-%2Bgerman-forward%2Bgerman-backward--v0.2",
                "de-pos-ud-v0.2.pt",
            ]
        )

        model_map["de-pos-fine-grained"] = "/".join(
            [
                aws_resource_path_v04,
                "POS-fine-grained-german-tweets",
                "de-pos-twitter-v0.1.pt",
            ]
        )

        model_map["de-ner"] = "/".join(
            [
                aws_resource_path,
                "NER-conll03ger--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2",
                "de-ner-conll03-v0.3.pt",
            ]
        )

        model_map["de-ner-germeval"] = "/".join(
            [
                aws_resource_path,
                "NER-germeval--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2",
                "de-ner-germeval-v0.3.pt",
            ]
        )

        model_map["fr-ner"] = "/".join(
            [aws_resource_path, "NER-aij-wikiner-fr-wp3", "fr-ner.pt"]
        )
        model_map["nl-ner"] = "/".join(
            [aws_resource_path_v04, "NER-conll2002-dutch", "nl-ner-conll02-v0.1.pt"]
        )

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        return model_name
Exemplo n.º 30
0
    def _fetch_model(model_name) -> str:

        # core Flair models on Huggingface ModelHub
        huggingface_model_map = {
            "ner": "flair/ner-english",
            "ner-fast": "flair/ner-english-fast",
            "ner-ontonotes": "flair/ner-english-ontonotes",
            "ner-ontonotes-fast": "flair/ner-english-ontonotes-fast",
            # Large NER models,
            "ner-large": "flair/ner-english-large",
            "ner-ontonotes-large": "flair/ner-english-ontonotes-large",
            "de-ner-large": "flair/ner-german-large",
            "nl-ner-large": "flair/ner-dutch-large",
            "es-ner-large": "flair/ner-spanish-large",
            # Multilingual NER models
            "ner-multi": "flair/ner-multi",
            "multi-ner": "flair/ner-multi",
            "ner-multi-fast": "flair/ner-multi-fast",
            # English POS models
            "upos": "flair/upos-english",
            "upos-fast": "flair/upos-english-fast",
            "pos": "flair/pos-english",
            "pos-fast": "flair/pos-english-fast",
            # Multilingual POS models
            "pos-multi": "flair/upos-multi",
            "multi-pos": "flair/upos-multi",
            "pos-multi-fast": "flair/upos-multi-fast",
            "multi-pos-fast": "flair/upos-multi-fast",
            # English SRL models
            "frame": "flair/frame-english",
            "frame-fast": "flair/frame-english-fast",
            # English chunking models
            "chunk": "flair/chunk-english",
            "chunk-fast": "flair/chunk-english-fast",
            # Language-specific NER models
            "da-ner": "flair/ner-danish",
            "de-ner": "flair/ner-german",
            "de-ler": "flair/ner-german-legal",
            "de-ner-legal": "flair/ner-german-legal",
            "fr-ner": "flair/ner-french",
            "nl-ner": "flair/ner-dutch",
        }

        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        hu_model_map = {
            # English NER models
            "ner":
            "/".join([hu_path, "ner", "en-ner-conll03-v0.4.pt"]),
            "ner-pooled":
            "/".join([hu_path, "ner-pooled", "en-ner-conll03-pooled-v0.5.pt"]),
            "ner-fast":
            "/".join([hu_path, "ner-fast", "en-ner-fast-conll03-v0.4.pt"]),
            "ner-ontonotes":
            "/".join([hu_path, "ner-ontonotes", "en-ner-ontonotes-v0.4.pt"]),
            "ner-ontonotes-fast":
            "/".join([
                hu_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"
            ]),
            # Multilingual NER models
            "ner-multi":
            "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "multi-ner":
            "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "ner-multi-fast":
            "/".join([hu_path, "multi-ner-fast", "ner-multi-fast.pt"]),
            # English POS models
            "upos":
            "/".join([hu_path, "upos", "en-pos-ontonotes-v0.4.pt"]),
            "upos-fast":
            "/".join([hu_path, "upos-fast", "en-upos-ontonotes-fast-v0.4.pt"]),
            "pos":
            "/".join([hu_path, "pos", "en-pos-ontonotes-v0.5.pt"]),
            "pos-fast":
            "/".join([hu_path, "pos-fast", "en-pos-ontonotes-fast-v0.5.pt"]),
            # Multilingual POS models
            "pos-multi":
            "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "multi-pos":
            "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "pos-multi-fast":
            "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            "multi-pos-fast":
            "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            # English SRL models
            "frame":
            "/".join([hu_path, "frame", "en-frame-ontonotes-v0.4.pt"]),
            "frame-fast":
            "/".join(
                [hu_path, "frame-fast", "en-frame-ontonotes-fast-v0.4.pt"]),
            # English chunking models
            "chunk":
            "/".join([hu_path, "chunk", "en-chunk-conll2000-v0.4.pt"]),
            "chunk-fast":
            "/".join(
                [hu_path, "chunk-fast", "en-chunk-conll2000-fast-v0.4.pt"]),
            # Danish models
            "da-pos":
            "/".join([hu_path, "da-pos", "da-pos-v0.1.pt"]),
            "da-ner":
            "/".join([hu_path, "NER-danish", "da-ner-v0.1.pt"]),
            # German models
            "de-pos":
            "/".join([hu_path, "de-pos", "de-pos-ud-hdt-v0.5.pt"]),
            "de-pos-tweets":
            "/".join([hu_path, "de-pos-tweets", "de-pos-twitter-v0.1.pt"]),
            "de-ner":
            "/".join([hu_path, "de-ner", "de-ner-conll03-v0.4.pt"]),
            "de-ner-germeval":
            "/".join([hu_path, "de-ner-germeval", "de-ner-germeval-0.4.1.pt"]),
            "de-ler":
            "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]),
            "de-ner-legal":
            "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]),
            # French models
            "fr-ner":
            "/".join([hu_path, "fr-ner", "fr-ner-wikiner-0.4.pt"]),
            # Dutch models
            "nl-ner":
            "/".join([hu_path, "nl-ner", "nl-ner-bert-conll02-v0.8.pt"]),
            "nl-ner-rnn":
            "/".join([hu_path, "nl-ner-rnn", "nl-ner-conll02-v0.5.pt"]),
            # Malayalam models
            "ml-pos":
            "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt",
            "ml-upos":
            "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt",
            # Portuguese models
            "pt-pos-clinical":
            "/".join([
                hu_path,
                "pt-pos-clinical",
                "pucpr-flair-clinical-pos-tagging-best-model.pt",
            ]),
            # Keyphase models
            "keyphrase":
            "/".join([hu_path, "keyphrase", "keyphrase-en-scibert.pt"]),
            "negation-speculation":
            "/".join([
                hu_path, "negation-speculation",
                "negation-speculation-model.pt"
            ]),
            # Biomedical models
            "hunflair-paper-cellline":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "cellline",
                "hunflair-celline-v1.0.pt",
            ]),
            "hunflair-paper-chemical":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "chemical",
                "hunflair-chemical-v1.0.pt",
            ]),
            "hunflair-paper-disease":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "disease",
                "hunflair-disease-v1.0.pt",
            ]),
            "hunflair-paper-gene":
            "/".join([
                hu_path, "hunflair_smallish_models", "gene",
                "hunflair-gene-v1.0.pt"
            ]),
            "hunflair-paper-species":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "species",
                "hunflair-species-v1.0.pt",
            ]),
            "hunflair-cellline":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "cellline",
                "hunflair-celline-v1.0.pt",
            ]),
            "hunflair-chemical":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-chemical",
                "hunflair-chemical-full-v1.0.pt",
            ]),
            "hunflair-disease":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-disease",
                "hunflair-disease-full-v1.0.pt",
            ]),
            "hunflair-gene":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-gene",
                "hunflair-gene-full-v1.0.pt",
            ]),
            "hunflair-species":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-species",
                "hunflair-species-full-v1.1.pt",
            ]),
        }

        cache_dir = Path("models")

        get_from_model_hub = False

        # check if model name is a valid local file
        if Path(model_name).exists():
            model_path = model_name

        # check if model key is remapped to HF key - if so, print out information
        elif model_name in huggingface_model_map:

            # get mapped name
            hf_model_name = huggingface_model_map[model_name]

            # use mapped name instead
            model_name = hf_model_name
            get_from_model_hub = True

        # if not, check if model key is remapped to direct download location. If so, download model
        elif model_name in hu_model_map:
            model_path = cached_path(hu_model_map[model_name],
                                     cache_dir=cache_dir)

        # special handling for the taggers by the @redewiegergabe project (TODO: move to model hub)
        elif model_name == "de-historic-indirect":
            model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/indirect.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "indirect.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "indirect" /
                             "final-model.pt")

        elif model_name == "de-historic-direct":
            model_file = flair.cache_root / cache_dir / "direct" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/direct.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "direct.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "direct" /
                             "final-model.pt")

        elif model_name == "de-historic-reported":
            model_file = flair.cache_root / cache_dir / "reported" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/reported.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "reported.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "reported" /
                             "final-model.pt")

        elif model_name == "de-historic-free-indirect":
            model_file = flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/freeIndirect.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "freeIndirect.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "freeIndirect" /
                             "final-model.pt")

        # for all other cases (not local file or special download location), use HF model hub
        else:
            get_from_model_hub = True

        # if not a local file, get from model hub
        if get_from_model_hub:
            hf_model_name = "pytorch_model.bin"
            revision = "main"

            if "@" in model_name:
                model_name_split = model_name.split("@")
                revision = model_name_split[-1]
                model_name = model_name_split[0]

            # use model name as subfolder
            if "/" in model_name:
                model_folder = model_name.split("/", maxsplit=1)[1]
            else:
                model_folder = model_name

            # Lazy import
            from huggingface_hub import cached_download, hf_hub_url

            url = hf_hub_url(model_name,
                             revision=revision,
                             filename=hf_model_name)

            try:
                model_path = cached_download(
                    url=url,
                    library_name="flair",
                    library_version=flair.__version__,
                    cache_dir=flair.cache_root / "models" / model_folder,
                )
            except HTTPError:
                # output information
                log.error("-" * 80)
                log.error(
                    f"ACHTUNG: The key '{model_name}' was neither found on the ModelHub nor is this a valid path to a file on your system!"
                )
                # log.error(f" - Error message: {e}")
                log.error(
                    " -> Please check https://huggingface.co/models?filter=flair for all available models."
                )
                log.error(
                    " -> Alternatively, point to a model file on your local drive."
                )
                log.error("-" * 80)
                Path(flair.cache_root / "models" /
                     model_folder).rmdir()  # remove folder again if not valid

        return model_path
Exemplo n.º 31
0
    def load(model: str):
        """Load one of the pre-trained v0.2 taggers by its shorthand name.

        ``model`` is lower-cased and looked up in the table below. For a known
        shorthand the file is obtained through ``cached_path`` (cache directory
        ``models``) and loaded with ``SequenceTagger.load_from_file``. For any
        other name nothing is downloaded and ``None`` is returned.
        """
        aws_resource_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2'

        # shorthand -> (folder, file name) below aws_resource_path
        remote_locations = {
            'ner': ('NER-conll03--h256-l1-b32-%2Bglove%2Bnews-forward%2Bnews-backward--v0.2',
                    'en-ner-conll03-v0.2.pt'),
            'ner-fast': ('NER-conll03--h256-l1-b32-experimental--fast-v0.2',
                         'en-ner-fast-conll03-v0.2.pt'),
            'ner-ontonotes': ('NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward%2Bnews-backward--v0.2',
                              'en-ner-ontonotes-v0.2.pt'),
            'ner-ontonotes-fast': ('NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
                                   'en-ner-ontonotes-fast-v0.2.pt'),
            'pos': ('POS-ontonotes--h256-l1-b32-%2Bmix-forward%2Bmix-backward--v0.2',
                    'en-pos-ontonotes-v0.2.pt'),
            'pos-fast': ('POS-ontonotes--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
                         'en-pos-ontonotes-fast-v0.2.pt'),
            'frame': ('FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward%2Bnews-backward--v0.2',
                      'en-frame-ontonotes-v0.2.pt'),
            'frame-fast': ('FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
                           'en-frame-ontonotes-fast-v0.2.pt'),
            'chunk': ('NP-conll2000--h256-l1-b32-%2Bnews-forward%2Bnews-backward--v0.2',
                      'en-chunk-conll2000-v0.2.pt'),
            'chunk-fast': ('NP-conll2000--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
                           'en-chunk-conll2000-fast-v0.2.pt'),
            'de-pos': ('UPOS-udgerman--h256-l1-b8-%2Bgerman-forward%2Bgerman-backward--v0.2',
                       'de-pos-ud-v0.2.pt'),
            'de-ner': ('NER-conll03ger--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2',
                       'de-ner-conll03-v0.2.pt'),
            'de-ner-germeval': ('NER-germeval--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2',
                                'de-ner-germeval-v0.2.pt'),
        }

        location = remote_locations.get(model.lower())
        if location is None:
            # unknown shorthand: nothing to download, nothing to load
            return None

        folder, file_name = location
        model_file = cached_path('/'.join([aws_resource_path, folder, file_name]),
                                 cache_dir='models')

        if model_file is not None:
            return SequenceTagger.load_from_file(model_file)