def _download_if_required(self):
    # Download the model weights and data to the client machine
    cached_path(
        path=self.final_model_dir,
        url="https://parsect-models.s3-ap-southeast-1.amazonaws.com/lstm_crf_parscit_final.zip",
    )
def _download_if_required(self):
    # Download the model weights and data to the client machine
    cached_path(
        path=f"{self.final_model_dir}.zip",
        url="https://parsect-models.s3-ap-southeast-1.amazonaws.com/i2b2.zip",
        unzip=True,
    )
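Both helpers above delegate to cached_path, whose implementation is not shown here. Based on how it is called throughout this section (a local path, a URL, an unzip flag, and a returned path), a minimal sketch of what it presumably does might look like the following; the library's actual implementation may differ.

import pathlib
import urllib.request
import zipfile

def cached_path(path, url, unzip=True):
    # Hypothetical sketch of the caching helper used throughout this section:
    # fetch `url` into `path` if it is not already present on disk,
    # optionally extract the archive next to itself, and return the path.
    path = pathlib.Path(path)
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        urllib.request.urlretrieve(url, path)
        if unzip:
            with zipfile.ZipFile(path) as archive:
                archive.extractall(path.parent)
    return path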
def __init__(
    self,
    dropout_value: float = 0.5,
    datasets_manager: DatasetsManager = None,
    word_tokens_namespace: str = "tokens",
    device: torch.device = torch.device("cpu"),
    fine_tune: bool = False,
):
    super(ElmoEmbedder, self).__init__()

    # Sometimes you need two different tensors that are
    # two different linear combinations of the representations
    # TODO: change this in case you need 2 representations
    self.num_output_representations = 1
    self.dropout_value = dropout_value
    self.datasets_manager = datasets_manager
    self.device = torch.device(device) if isinstance(device, str) else device
    self.msg_printer = wasabi.Printer()
    self.word_tokens_namespace = word_tokens_namespace
    self.fine_tune = fine_tune
    self.embedder_name = "ElmoEmbedder"

    self.elmo_options_file = pathlib.Path(ELMO_OPTIONS_FILE)
    self.elmo_weights_file = pathlib.Path(ELMO_WEIGHTS_FILE)

    # Download the ELMo options and weights files if not cached locally
    if not self.elmo_options_file.is_file():
        self.elmo_options_file = cached_path(
            url=EMBEDDING_FILE_URLS["ELMO_OPTIONS_FILE"],
            path=self.elmo_options_file,
            unzip=False,
        )
        self.elmo_weights_file = cached_path(
            url=EMBEDDING_FILE_URLS["ELMO_WEIGHTS_FILE"],
            path=self.elmo_weights_file,
            unzip=False,
        )

    with self.msg_printer.loading("Loading Elmo Object"):
        self.elmo: nn.Module = Elmo(
            options_file=self.elmo_options_file,
            weight_file=self.elmo_weights_file,
            num_output_representations=self.num_output_representations,
            dropout=self.dropout_value,
            requires_grad=fine_tune,
        )
    self.msg_printer.good("Finished Loading ELMO object")
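For context, a minimal usage sketch of the embedder above; the import path is an assumption and may differ in the actual package.

# Hypothetical usage; the import path below is an assumption.
from sciwing.modules.embedders.elmo_embedder import ElmoEmbedder

# Constructing the embedder downloads the ELMo options/weights files on
# first use and loads the underlying Elmo module.
embedder = ElmoEmbedder(dropout_value=0.5, device="cpu", fine_tune=False)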
def _get_data(self):
    data_manager = SeqLabellingDatasetManager(
        train_filename=cached_path(
            path=self.data_dir.joinpath("parscit.train"),
            url=self.train_data_file_url,
            unzip=False,
        ),
        dev_filename=cached_path(
            path=self.data_dir.joinpath("parscit.dev"),
            url=self.dev_data_file_url,
            unzip=False,
        ),
        test_filename=cached_path(
            path=self.data_dir.joinpath("parscit.test"),
            url=self.test_data_file_url,
            unzip=False,
        ),
    )
    return data_manager
def _get_data(self):
    train_file = cached_path(
        path=self.data_dir.joinpath("scicite.train"),
        url=self.train_data_url,
        unzip=False,
    )
    dev_file = cached_path(
        path=self.data_dir.joinpath("scicite.dev"),
        url=self.dev_data_url,
        unzip=False,
    )
    test_file = cached_path(
        path=self.data_dir.joinpath("scicite.test"),
        url=self.test_data_url,
        unzip=False,
    )
    data_manager = TextClassificationDatasetManager(
        train_filename=train_file,
        dev_filename=dev_file,
        test_filename=test_file,
    )
    return data_manager
def _get_data(self):
    train_filename = cached_path(
        path=self.data_dir.joinpath("i2b2.train"),
        url=self.train_data_url,
        unzip=False,
    )
    dev_filename = cached_path(
        path=self.data_dir.joinpath("i2b2.dev"),
        url=self.dev_data_url,
        unzip=False,
    )
    # Note: the dev split is reused as the test split here
    test_filename = cached_path(
        path=self.data_dir.joinpath("i2b2.dev"),
        url=self.dev_data_url,
        unzip=False,
    )
    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        # Every column carries NER tags; train only on the "ner" task
        column_names=["NER", "NER", "NER"],
        train_only="ner",
    )
    return data_manager
def _get_data(self):
    train_filename = self.data_dir.joinpath("sectLabel.train")
    dev_filename = self.data_dir.joinpath("sectLabel.dev")
    test_filename = self.data_dir.joinpath("sectLabel.test")

    train_filename = cached_path(
        path=train_filename, url=self.train_data_url, unzip=False
    )
    dev_filename = cached_path(
        path=dev_filename, url=self.dev_data_url, unzip=False
    )
    test_filename = cached_path(
        path=test_filename, url=self.test_data_url, unzip=False
    )

    data_manager = TextClassificationDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
    )
    return data_manager
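All four _get_data variants above follow the same pattern: resolve each split file through cached_path, then hand the three filenames to a dataset manager. An illustrative helper capturing that shared pattern might look like this; the helper name and its wiring are assumptions, not part of the library.

def _resolve_split_files(data_dir, prefix, urls):
    # Illustrative helper (not from the library): download each split file
    # if it is missing and return keyword arguments for a dataset manager.
    # `urls` maps a split name ("train"/"dev"/"test") to its download URL.
    return {
        f"{split}_filename": cached_path(
            path=data_dir.joinpath(f"{prefix}.{split}"),
            url=urls[split],
            unzip=False,
        )
        for split in ("train", "dev", "test")
    }

# e.g. TextClassificationDatasetManager(
#     **_resolve_split_files(self.data_dir, "sectLabel", urls))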
def get_preloaded_filename(self):
    filename = None
    url = None

    if self.embedding_type == "glove_6B_50":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "glove.6B.50d.txt")
        url = EMBEDDING_FILE_URLS["GLOVE_FILE"]
    elif self.embedding_type == "glove_6B_100":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "glove.6B.100d.txt")
        url = EMBEDDING_FILE_URLS["GLOVE_FILE"]
    elif self.embedding_type == "glove_6B_200":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "glove.6B.200d.txt")
        url = EMBEDDING_FILE_URLS["GLOVE_FILE"]
    elif self.embedding_type == "glove_6B_300":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "glove.6B.300d.txt")
        url = EMBEDDING_FILE_URLS["GLOVE_FILE"]
    elif self.embedding_type == "parscit":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "vectors_with_unk.kv")
        url = EMBEDDING_FILE_URLS["PARSCIT_EMBEDDINGS"]
    elif self.embedding_type == "lample_conll":
        filename = os.path.join(EMBEDDING_CACHE_DIR, "lample_conll")
        url = EMBEDDING_FILE_URLS["LAMPLE_CONLL"]
    else:
        raise ValueError(
            f"Check the embedding type. It has to be one of {self.allowed_embedding_types}"
        )

    # Download the archive into the cache directory (named after the last
    # URL segment) and unzip it; the returned filename points at the
    # extracted embedding file
    url_path = pathlib.Path(url)
    destination_path = url_path.parts[-1]
    destination_path = self.embedding_cache_dir.joinpath(destination_path)
    _ = cached_path(url=url, unzip=True, path=destination_path)
    return filename
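The if/elif ladder above only maps each embedding type to a filename and a URL key, so it can equivalently be written as a table lookup. A sketch of that alternative with the same behavior follows; the dictionary name is illustrative, not from the source.

# Illustrative alternative: map embedding types to (filename, URL key).
_EMBEDDING_FILES = {
    "glove_6B_50": ("glove.6B.50d.txt", "GLOVE_FILE"),
    "glove_6B_100": ("glove.6B.100d.txt", "GLOVE_FILE"),
    "glove_6B_200": ("glove.6B.200d.txt", "GLOVE_FILE"),
    "glove_6B_300": ("glove.6B.300d.txt", "GLOVE_FILE"),
    "parscit": ("vectors_with_unk.kv", "PARSCIT_EMBEDDINGS"),
    "lample_conll": ("lample_conll", "LAMPLE_CONLL"),
}

def get_preloaded_filename(self):
    try:
        name, url_key = _EMBEDDING_FILES[self.embedding_type]
    except KeyError:
        raise ValueError(
            f"Check the embedding type. It has to be one of {self.allowed_embedding_types}"
        )
    filename = os.path.join(EMBEDDING_CACHE_DIR, name)
    url = EMBEDDING_FILE_URLS[url_key]
    destination_path = self.embedding_cache_dir.joinpath(pathlib.Path(url).parts[-1])
    cached_path(url=url, unzip=True, path=destination_path)
    return filename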
def _download_if_required(self):
    cached_path(
        path=self.final_model_dir,
        url="https://parsect-models.s3-ap-southeast-1.amazonaws.com/sectlabel_elmo_bilstm.zip",
    )
def _download_if_required(self):
    cached_path(
        path=self.final_model_dir,
        url="https://parsect-models.s3-ap-southeast-1.amazonaws.com/genericsect_bow_elmo.zip",
    )