def convert_conll2003_ner_to_bioul(filename: str, out_filename: str):
    """ Converts a CoNLL 2003 NER file to BIOUL-tagged strings and writes them
    to ``out_filename``. The output file has the word in the first column and
    the NER tag repeated in the next three columns.

    Parameters
    ----------
    filename: str
        Path to the input file in CoNLL 2003 format
    out_filename: str
        Path where the BIOUL-tagged file is written

    Returns
    -------
    None
    """
    msg_printer = wasabi.Printer()
    lines: List[List[str]] = []
    labels: List[List[str]] = []

    with open(filename) as fp:
        lines_: List[str] = []
        labels_: List[str] = []  # every list is a label for one namespace
        for text in fp:
            text_ = text.strip()
            # check the document marker before the non-empty check; a
            # "-DOCSTART-" line is itself non-empty and would otherwise be
            # swallowed by the branch below and added as a token
            if text_.startswith("-DOCSTART-"):
                # skip the next empty line as well
                lines_ = []
                labels_ = []
                next(fp)
            elif bool(text_):
                line_labels = text_.split()
                line_ = line_labels[0]
                label_ = line_labels[3]  # the NER tag in the fourth column
                lines_.append(line_)
                labels_.append(label_)
            else:
                if len(lines_) > 0 and len(labels_) > 0:
                    lines.append(lines_)
                    labels.append(labels_)
                lines_ = []
                labels_ = []

    bilou_tags = []
    for label in labels:
        bilou_ = to_bioul(tag_sequence=label, encoding="IOB1")
        bilou_tags.append(bilou_)

    with msg_printer.loading(f"writing BILOU tags for {filename}"):
        with open(out_filename, "w") as fp:
            for line, bilou_tags_ in zip(lines, bilou_tags):
                assert len(line) == len(bilou_tags_)
                for word, tag in zip(line, bilou_tags_):
                    fp.write(" ".join([word, tag, tag, tag]))
                    fp.write("\n")
                fp.write("\n")

    msg_printer.good(f"Finished writing BILOU tags for {filename}")
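# Usage sketch for the converter above. The file paths are hypothetical; any
# file in CoNLL 2003 format with the NER tag in the fourth column works.
if __name__ == "__main__":
    convert_conll2003_ner_to_bioul(
        filename="eng.train", out_filename="eng.train.bioul"
    )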
def __init__(self, bert_type: str):
    super(TokenizerForBert, self).__init__()
    self.bert_type = bert_type
    self.msg_printer = wasabi.Printer()
    self.allowed_bert_types = [
        "bert-base-uncased",
        "bert-large-uncased",
        "bert-base-cased",
        "bert-large-cased",
        "scibert-base-cased",
        "scibert-sci-cased",
        "scibert-base-uncased",
        "scibert-sci-uncased",
    ]
    self.scibert_foldername_mapping = {
        "scibert-base-cased": "scibert_basevocab_cased",
        "scibert-sci-cased": "scibert_scivocab_cased",
        "scibert-base-uncased": "scibert_basevocab_uncased",
        "scibert-sci-uncased": "scibert_scivocab_uncased",
    }

    assert bert_type in self.allowed_bert_types, self.msg_printer.fail(
        f"You passed {bert_type} for attribute bert_type. "
        f"The allowed types are {self.allowed_bert_types}"
    )

    self.vocab_type_or_filename = None
    if "scibert" in self.bert_type:
        foldername = self.scibert_foldername_mapping[self.bert_type]
        self.vocab_type_or_filename = os.path.join(
            EMBEDDING_CACHE_DIR, foldername, "vocab.txt"
        )
    else:
        self.vocab_type_or_filename = self.bert_type

    with self.msg_printer.loading("Loading Bert model"):
        self.tokenizer = BertTokenizer.from_pretrained(self.vocab_type_or_filename)
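# A minimal usage sketch for the tokenizer above; "bert-base-uncased" is one
# of the allowed bert types, and the wrapped BertTokenizer exposes tokenize().
tokenizer_for_bert = TokenizerForBert(bert_type="bert-base-uncased")
word_pieces = tokenizer_for_bert.tokenizer.tokenize("SciWING tokenizes scientific text")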
def __init__(self):
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
    self.final_model_dir = self.models_cache_dir.joinpath("genericsect_bow_elmo")

    if not self.models_cache_dir.is_dir():
        self.models_cache_dir.mkdir(parents=True)

    self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
    self.data_dir = pathlib.Path(DATA_DIR)

    if not self.data_dir.is_dir():
        self.data_dir.mkdir(parents=True)

    self.train_data_url = DATA_FILE_URLS["GENERIC_SECTION_TRAIN_FILE"]
    self.dev_data_url = DATA_FILE_URLS["GENERIC_SECTION_DEV_FILE"]
    self.test_data_url = DATA_FILE_URLS["GENERIC_SECTION_TEST_FILE"]
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.data_manager = self._get_data()
    self.hparams = self._get_hparams()
    self.model = self._get_model()
    self.infer = self._get_infer_client()
    self.cli_interact = SciWINGInteract(self.infer)
def __init__(
    self,
    encoder: BowElmoEmbedder,
    encoding_dim: int,
    num_classes: int,
    classification_layer_bias: bool = True,
):
    """
    Parameters
    ----------
    encoder : BowElmoEmbedder
        Bag of words ELMo embedder that embeds words by aggregating the ELMo
        representations across words, either by summing, averaging or another strategy
    encoding_dim : int
        Dimension of the text encoding
    num_classes : int
        Number of classes in the dataset
    classification_layer_bias : bool
        Whether to add a bias to the classification layer. Set this to ``False``
        only for testing or debugging; otherwise keep it ``True``
    """
    super(BowElmoLinearClassifier, self).__init__()
    self.encoder = encoder
    self.encoding_dim = encoding_dim
    self.num_classes = num_classes
    self.classification_layer_bias = classification_layer_bias
    self.classification_layer = nn.Linear(
        encoding_dim, num_classes, bias=self.classification_layer_bias
    )
    self._loss = CrossEntropyLoss()
    self.msg_printer = wasabi.Printer()
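# A construction sketch for the classifier above, assuming the BowElmoEmbedder
# defined later in this section; 1024 is the standard ELMo representation size
# and num_classes=3 is an arbitrary illustrative value.
encoder = BowElmoEmbedder(layer_aggregation="sum")
classifier = BowElmoLinearClassifier(
    encoder=encoder, encoding_dim=1024, num_classes=3
)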
def __init__(self):
    super(I2B2NER, self).__init__()
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

    if not self.models_cache_dir.is_dir():
        self.models_cache_dir.mkdir(parents=True)

    self.final_model_dir = self.models_cache_dir.joinpath("i2b2")
    self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
    self.data_dir = pathlib.Path(DATA_DIR)

    if not self.data_dir.is_dir():
        self.data_dir.mkdir(parents=True)

    self.train_data_url = DATA_FILE_URLS["I2B2_TRAIN"]
    self.dev_data_url = DATA_FILE_URLS["I2B2_DEV"]
    # the dev split is reused as the test split here
    self.test_data_url = DATA_FILE_URLS["I2B2_DEV"]
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.hparams = self._get_hparams()
    self.data_manager = self._get_data()
    self.model: nn.Module = self._get_model()
    self.infer = self._get_infer_client()
    self.vis_tagger = VisTagging()
    self.cli_interact = SciWINGInteract(self.infer)
def __init__(
    self,
    model: nn.Module,
    model_filepath: str,
    dataset: BaseSeqLabelingDataset,
    device: Optional[Union[str, torch.device]] = torch.device("cpu"),
):
    super(ParscitInference, self).__init__(
        model=model, model_filepath=model_filepath, dataset=dataset, device=device
    )
    self.msg_printer = wasabi.Printer()
    self.labelname2idx_mapping = self.dataset.get_classname2idx()
    self.idx2labelname_mapping = {
        idx: label_name for label_name, idx in self.labelname2idx_mapping.items()
    }
    self.metrics_calculator = TokenClassificationAccuracy(
        idx2labelname_mapping=self.idx2labelname_mapping
    )
    self.output_analytics = None
    self.output_df = None
    self.batch_size = 32
    self.load_model()

    num_categories = self.dataset.get_num_classes()
    categories = [self.idx2labelname_mapping[idx] for idx in range(num_categories)]
    self.seq_tagging_visualizer = VisTagging(tags=categories)
def __init__(self, log_file: str = None, device: str = "cpu"):
    self.device = device
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

    if not self.models_cache_dir.is_dir():
        self.models_cache_dir.mkdir(parents=True)

    self.final_model_dir = self.models_cache_dir.joinpath("sectlabel_elmo_bilstm")
    self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
    self.data_dir = pathlib.Path(DATA_DIR)

    if not self.data_dir.is_dir():
        self.data_dir.mkdir(parents=True)

    self.train_data_url = DATA_FILE_URLS["SECT_LABEL_TRAIN_FILE"]
    self.dev_data_url = DATA_FILE_URLS["SECT_LABEL_DEV_FILE"]
    self.test_data_url = DATA_FILE_URLS["SECT_LABEL_TEST_FILE"]
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.data_manager = self._get_data()
    self.hparams = self._get_hparams()
    self.model = self._get_model()
    self.infer = self._get_infer_client()
    self.cli_interact = SciWINGInteract(self.infer)
    self.log_file = log_file

    if log_file:
        self.logger = setup_logger(
            "sectlabel_logger", logfile=self.log_file, level=logging.INFO
        )
    else:
        self.logger = self.msg_printer
def __init__(
    self,
    dropout_value: float = 0.5,
    datasets_manager: DatasetsManager = None,
    word_tokens_namespace: str = "tokens",
    device: torch.device = torch.device("cpu"),
    fine_tune: bool = False,
):
    super(ElmoEmbedder, self).__init__()

    # Sometimes you need two different tensors that are
    # two different linear combinations of the representations
    # TODO: change this in case you need 2 representations
    self.num_output_representations = 1
    self.dropout_value = dropout_value
    self.datasets_manager = datasets_manager
    self.device = torch.device(device) if isinstance(device, str) else device
    self.msg_printer = wasabi.Printer()
    self.word_tokens_namespace = word_tokens_namespace
    self.fine_tune = fine_tune
    self.embedder_name = "ElmoEmbedder"

    with self.msg_printer.loading("Loading Elmo Object"):
        self.elmo: nn.Module = Elmo(
            options_file=ELMO_OPTIONS_FILE,
            weight_file=ELMO_WEIGHTS_FILE,
            num_output_representations=self.num_output_representations,
            dropout=self.dropout_value,
            requires_grad=fine_tune,
        )

    self.msg_printer.good("Finished Loading ELMO object")
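# A construction sketch for the embedder above. With these arguments the ELMo
# weights stay frozen (fine_tune=False) and everything runs on the CPU.
elmo_embedder = ElmoEmbedder(
    dropout_value=0.5, device=torch.device("cpu"), fine_tune=False
)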
def __init__(self):
    super(CitationIntentClassification, self).__init__()
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

    if not self.models_cache_dir.is_dir():
        self.models_cache_dir.mkdir(parents=True)

    self.final_model_dir = self.models_cache_dir.joinpath("citation_intent_clf_elmo")
    self.data_dir = pathlib.Path(DATA_DIR)

    if not self.data_dir.is_dir():
        self.data_dir.mkdir(parents=True)

    self.train_data_url = DATA_FILE_URLS["SCICITE_TRAIN"]
    self.dev_data_url = DATA_FILE_URLS["SCICITE_DEV"]
    self.test_data_url = DATA_FILE_URLS["SCICITE_TEST"]
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.hparams = self._get_hparams()
    self.data_manager = self._get_data()
    self.model: nn.Module = self._get_model()
    self.infer = self._get_infer_client()
    self.cli_interact = SciWINGInteract(infer_client=self.infer)
def __init__(
    self,
    model: nn.Module,
    model_filepath: str,
    datasets_manager: DatasetsManager,
    device: Optional[Union[str, torch.device]] = torch.device("cpu"),
):
    """
    Parameters
    ----------
    model : nn.Module
        A pytorch module
    model_filepath : str
        The path where the parameters of the best model are stored. This is
        usually the ``best_model.pt`` file in an experiment directory
    datasets_manager : DatasetsManager
        The datasets manager that holds the train, dev and test datasets
    device : Optional[Union[str, torch.device]]
        Either a string like ``cpu`` or ``cuda:0``, or a torch.device object
    """
    self.model = model
    self.model_filepath = model_filepath
    self.datasets_manager = datasets_manager
    self.device = torch.device(device) if isinstance(device, str) else device
    self.msg_printer = wasabi.Printer()
def print_reminders(tools: List[dict]) -> None:
    storage_data = {}
    try:
        script_dir = os.path.dirname(__file__)
        storage_file_path = os.path.join(script_dir, "tool_reminder_storage.json")
        with open(storage_file_path, 'r') as json_file:
            storage_data = json.load(json_file)
    except FileNotFoundError:
        pass

    printer = wasabi.Printer()
    for tool in tools:
        tool_name = tool['name']
        tool_list = tool['list']
        storage_key = 'curr {0} index'.format(tool_name)
        curr_index = storage_data.get(storage_key, 0)
        tools_per_print = min(3, len(tool_list))

        printer.warn(tool_name)  # use warn because of the color
        for _ in range(tools_per_print):
            printer.info(tool_list[curr_index])
            curr_index = (curr_index + 1) % len(tool_list)

        if tool != tools[-1]:
            print()

        storage_data[storage_key] = curr_index

    with open(storage_file_path, 'w') as outfile:
        json.dump(storage_data, outfile)
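# Usage sketch: every entry needs a "name" and a "list" key, matching the
# dictionary accesses above; the rotation index survives across runs via
# tool_reminder_storage.json.
if __name__ == "__main__":
    print_reminders(
        tools=[
            {"name": "editors", "list": ["vim", "emacs", "nano", "sed"]},
            {"name": "shells", "list": ["bash", "zsh", "fish"]},
        ]
    )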
def __init__(
    self,
    datasets_manager: DatasetsManager = None,
    layer_aggregation: str = "sum",
    device: Union[str, torch.device] = torch.device("cpu"),
    word_tokens_namespace: str = "tokens",
):
    """ Bag of words ELMo embedder which aggregates the ELMo embedding for every token

    Parameters
    ----------
    datasets_manager : DatasetsManager
        The datasets manager that holds the datasets whose tokens are embedded
    layer_aggregation : str
        You can choose one of ``[sum, average, last, first]``, which decides
        how to aggregate the different layers of ELMo. ELMo produces three
        layers of representations

        sum
            Representations from the different layers are summed
        average
            Representations from the different layers are averaged
        last
            The representation from the last layer is used
        first
            The representation from the first layer is used
    device : Union[str, torch.device]
        The device on which the model runs
    word_tokens_namespace : str
        The namespace where all the word tokens are stored
    """
    super(BowElmoEmbedder, self).__init__()
    self.dataset_manager = datasets_manager
    self.embedding_dimension = self.get_embedding_dimension()
    self.embedder_name = "elmo"
    self.word_tokens_namespace = word_tokens_namespace
    self.layer_aggregation_type = layer_aggregation
    self.allowed_layer_aggregation_types = ["sum", "average", "last", "first"]
    self.device = torch.device(device) if isinstance(device, str) else device

    # device.index is None on the cpu; the allennlp ElmoEmbedder expects -1
    # for the cpu. Checking `is not None` keeps cuda:0 on the gpu.
    if self.device.index is not None:
        self.cuda_device_id = self.device.index
    else:
        self.cuda_device_id = -1

    self.msg_printer = wasabi.Printer()

    assert (
        self.layer_aggregation_type in self.allowed_layer_aggregation_types
    ), self.msg_printer.fail(
        f"For the bag of words ELMo encoder, the allowed aggregation "
        f"types are {self.allowed_layer_aggregation_types}. You passed "
        f"{self.layer_aggregation_type}"
    )

    # load the elmo embedders
    with self.msg_printer.loading("Creating Elmo object"):
        self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
    self.msg_printer.good("Finished Loading Elmo object")
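# A construction sketch showing the layer aggregation choice; "average"
# averages ELMo's three representation layers per token, and a plain string
# device is normalized to a torch.device internally.
bow_elmo_embedder = BowElmoEmbedder(layer_aggregation="average", device="cpu")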
def __init__(self, infer_client: BaseInterfaceClient):
    if isinstance(infer_client, BaseInterfaceClient):
        self.infer_obj = infer_client.build_infer()
    else:
        # You can also pass the infer object directly.
        # Refer to sciwing.infer.seq_label.BaseSeqLabelInference or
        # sciwing.infer.seq_label.BaseClassificationInference
        self.infer_obj = infer_client

    self.msg_printer = wasabi.Printer()
def __init__(
    self,
    embedder,
    dropout_value: float = 0.0,
    hidden_dim: int = 1024,
    bidirectional: bool = False,
    combine_strategy: str = "concat",
    rnn_bias: bool = True,
    device: Union[str, torch.device] = torch.device("cpu"),
):
    """LSTM2Vec encoder that encodes a series of tokens into a single vector representation

    Parameters
    ----------
    embedder : nn.Module
        Any embedder can be passed
    dropout_value : float
        The dropout value for the input embeddings
    hidden_dim : int
        The hidden dimension of the LSTM
    bidirectional : bool
        Whether the LSTM is bidirectional
    combine_strategy : str
        The strategy for combining the vectors from the two directions
    rnn_bias : bool
        Whether to use the bias layer in the RNN. Should be set to ``False``
        only for debugging purposes
    device : Union[str, torch.device]
        The device on which the model runs
    """
    super(LSTM2VecEncoder, self).__init__()
    self.embedder = embedder
    self.emb_dim = embedder.get_embedding_dimension()
    self.dropout_value = dropout_value
    self.hidden_dimension = hidden_dim
    self.bidirectional = bidirectional
    self.num_directions = 2 if self.bidirectional else 1
    self.num_layers = 1
    self.combine_strategy = combine_strategy
    self.allowed_combine_strategies = ["sum", "concat"]
    self.rnn_bias = rnn_bias
    self.device = torch.device(device) if isinstance(device, str) else device
    self.msg_printer = wasabi.Printer()

    assert (
        self.combine_strategy in self.allowed_combine_strategies
    ), self.msg_printer.fail(
        f"The combine strategy can be one of "
        f"{self.allowed_combine_strategies}. You passed "
        f"{self.combine_strategy}"
    )

    self.emb_dropout = nn.Dropout(p=self.dropout_value)
    self.rnn = nn.LSTM(
        input_size=self.emb_dim,
        hidden_size=self.hidden_dimension,
        bias=self.rnn_bias,
        batch_first=True,
        bidirectional=self.bidirectional,
    )
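# A construction sketch with a stub embedder (hypothetical, for illustration
# only); a real embedder such as the ELMo or BERT embedders in this section
# would normally be passed. The encoder only calls get_embedding_dimension()
# on the embedder during construction.
import torch.nn as nn


class _StubEmbedder(nn.Module):
    def __init__(self, dim: int = 100):
        super().__init__()
        self.dim = dim

    def get_embedding_dimension(self) -> int:
        return self.dim


lstm2vec = LSTM2VecEncoder(
    embedder=_StubEmbedder(dim=100),
    hidden_dim=256,
    bidirectional=True,
    combine_strategy="concat",
)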
def write_pubmed_data_to_sciwing_seq2seq(
    pubmed_dir: str, subset: str, out_filename: str
):
    """ Converts the pubmed summarization dataset to the SciWING seq2seq format,
    where every line holds a document and its abstract separated by ###.

    Parameters
    ----------
    pubmed_dir : str
        The directory where the pubmed dataset is stored
    subset : str
        Choose from train, test and val
    out_filename : str
        Output file name

    Returns
    -------
    None
    """
    printer = wasabi.Printer()
    lines = []
    text_dir = os.path.join(pubmed_dir, "inputs", subset)
    abstract_dir = os.path.join(pubmed_dir, "human-abstracts", subset)
    filename_list = [filename.split(".")[0] for filename in os.listdir(text_dir)]

    print(f"Reading pubmed {subset} data")
    for filename in tqdm(filename_list):
        with open(os.path.join(abstract_dir, f"{filename}.txt"), "r") as fp:
            abstract = fp.read()

        with open(os.path.join(text_dir, f"{filename}.json"), "r") as fp:
            input_dict = json.load(fp)

        abstract = abstract.strip().replace("\n", " ")
        text = " ".join([chunk["text"] for chunk in input_dict["inputs"]])
        text = text.strip().replace("\n", " ")

        if bool(text) and bool(abstract):
            line = "###".join([text, abstract])
            lines.append(line)

    print(f"Writing pubmed {subset} data")
    with open(os.path.join(pubmed_dir, out_filename), "w") as fp:
        for line in lines:
            fp.write(line)
            fp.write("\n")

    printer.good(f"Finished writing {out_filename}")
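# Usage sketch (the directory is hypothetical and must follow the inputs/ and
# human-abstracts/ layout that the function above reads from):
if __name__ == "__main__":
    write_pubmed_data_to_sciwing_seq2seq(
        pubmed_dir="/data/pubmed", subset="train", out_filename="pubmed.train.txt"
    )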
def __init__(
    self,
    emb_dim: int = 1024,
    dropout_value: float = 0.0,
    layer_aggregation: str = "sum",
    cuda_device_id: int = -1,
):
    """ Bag of words ELMo embedder which aggregates the ELMo embedding for every token

    Parameters
    ----------
    emb_dim : int
        Embedding dimension
    dropout_value : float
        Any input dropout to be applied to the embeddings
    layer_aggregation : str
        You can choose one of ``[sum, average, last, first]``, which decides
        how to aggregate the different layers of ELMo. ELMo produces three
        layers of representations

        sum
            Representations from the different layers are summed
        average
            Representations from the different layers are averaged
        last
            The representation from the last layer is used
        first
            The representation from the first layer is used
    cuda_device_id : int
        The cuda device id on which the representations are placed.
        -1 indicates the cpu
    """
    super(BowElmoEmbedder, self).__init__()
    self.emb_dim = emb_dim
    self.dropout_value = dropout_value
    self.layer_aggregation_type = layer_aggregation
    self.allowed_layer_aggregation_types = ["sum", "average", "last", "first"]
    self.cuda_device_id = cuda_device_id
    self.device = (
        torch.device("cpu")
        if cuda_device_id < 0
        else torch.device(f"cuda:{cuda_device_id}")
    )
    self.msg_printer = wasabi.Printer()

    assert (
        self.layer_aggregation_type in self.allowed_layer_aggregation_types
    ), self.msg_printer.fail(
        f"For the bag of words ELMo encoder, the allowed aggregation "
        f"types are {self.allowed_layer_aggregation_types}. You passed "
        f"{self.layer_aggregation_type}"
    )

    # load the elmo embedders
    with self.msg_printer.loading("Creating Elmo object"):
        self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
    self.msg_printer.good("Finished Loading Elmo object")
def __init__(self, hparams: Dict[str, Any]):
    self.hparams = hparams
    data_dir = pathlib.Path(DATA_DIR)
    self.train_filename = data_dir.joinpath("eng.train")
    self.dev_filename = data_dir.joinpath("eng.testa")
    self.test_filename = data_dir.joinpath("eng.testb")
    self.printer = wasabi.Printer()
    self.data_manager = self.build_dataset()
    self.model = self.build_model()
    self.infer = self.build_infer()
def __init__(
    self,
    token2idx: Dict,
    embedding_type: Union[str, None] = None,
    embedding_dimension: Union[int, None] = None,
):
    self.token2idx = token2idx
    self.embedding_type = embedding_type
    self.embedding_dimension = embedding_dimension
    self.msg_printer = wasabi.Printer()
    self.vocab_embedding = self.load_embedding()
def write_extractive_to_sciwing_text_clf(
    extractive_data_dir: str, data_group: str, out_filename: str
):
    """ The preprocessed extractive summarization dataset contains 3 folders:
    human-abstracts (ground truth), inputs (document id, original sentences,
    tokenized sentences) and labels (document id, labels of each sentence in
    the document indicating whether the sentence should be included in the
    summary). Each folder has 3 sub-folders: train, test and val. Each json
    file under a sub-folder contains one document.

    Parameters
    ----------
    extractive_data_dir : str
        The directory where all the data files are stored
    data_group : str
        Choose from train, dev and test. Dev corresponds to the val folder
        in the input data
    out_filename : str
        The output filename where the extractive summarization dataset is stored

    Returns
    -------
    None
    """
    printer = wasabi.Printer()
    document = []
    input_data_human_abstract_dir = Path(
        extractive_data_dir, "human-abstracts", data_group
    )
    input_data_inputs_dir = Path(extractive_data_dir, "inputs", data_group)
    input_data_labels_dir = Path(extractive_data_dir, "labels", data_group)
    filename_list = [f.stem for f in input_data_human_abstract_dir.iterdir()]

    with printer.loading(f"Writing {out_filename}"):
        for filename in filename_list:
            ha_filename = input_data_human_abstract_dir.joinpath(f"{filename}.text")
            input_filename = input_data_inputs_dir.joinpath(f"{filename}.json")
            label_filename = input_data_labels_dir.joinpath(f"{filename}.json")

            with open(ha_filename, "r") as fp:
                abstract_str = fp.read().strip()
                abstract_str = abstract_str.replace("\n", " ")

            with open(input_filename, "r") as fp:
                input_dict = json.load(fp)
                # str has no .remove(); replacing newlines with spaces matches
                # the abstract handling above
                input_str = [
                    sent["text"].strip().replace("\n", " ")
                    for sent in input_dict["inputs"]
                ]

            with open(label_filename, "r") as fp:
                label_dict = json.load(fp)
                label_str = []
def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None):
    super(TokenClassificationAccuracy, self).__init__()
    self.idx2labelname_mapping = idx2labelname_mapping
    self.msg_printer = wasabi.Printer()
    self.classification_metrics_utils = ClassificationMetricsUtils(
        idx2labelname_mapping=idx2labelname_mapping
    )
    self.tp_counter = {}
    self.fp_counter = {}
    self.fn_counter = {}
    self.tn_counter = {}
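# A construction sketch; the mapping is a plain idx -> label-name dictionary,
# shown here with illustrative NER labels.
token_clf_accuracy = TokenClassificationAccuracy(
    idx2labelname_mapping={0: "O", 1: "B-PER", 2: "I-PER"}
)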
def __init__(
    self,
    train_dataset: Dataset,
    dev_dataset: Dataset = None,
    test_dataset: Dataset = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size: int = 32,
):
    """
    Parameters
    ----------
    train_dataset : Dataset
        A pytorch dataset that represents the training data
    dev_dataset : Dataset
        A pytorch dataset that represents the validation data
    test_dataset : Dataset
        A pytorch dataset that represents the test data
    namespace_vocab_options : Dict[str, Dict[str, Any]]
        For every namespace you can give a set of options that will be
        passed down to the Vocab
    namespace_numericalizer_map : Dict[str, BaseNumericalizer]
        For every namespace, you can give the numericalizer instance that
        will be used to numericalize the tokens in that namespace
    batch_size : int
        Batch size for loading the datasets
    """
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset
    self.label_namespaces: List[str] = None  # Holds the label namespaces
    self.msg_printer = wasabi.Printer()

    if namespace_vocab_options is None:
        self.namespace_vocab_options = {}
    else:
        self.namespace_vocab_options = namespace_vocab_options

    self.batch_size = batch_size
    self.namespace_to_numericalizer: Dict[
        str, BaseNumericalizer
    ] = namespace_numericalizer_map

    # Build the vocab using the datasets passed
    self.namespace_to_vocab: Dict[str, Vocab] = self.build_vocab()

    # Sets the vocab for the appropriate numericalizers
    self.namespace_to_numericalizer = self.build_numericalizers()
    self.namespaces = list(self.namespace_to_vocab.keys())
    self.num_labels = {}

    for namespace in self.label_namespaces:
        vocab = self.namespace_to_vocab[namespace]
        self.num_labels[namespace] = vocab.get_vocab_len()
def __init__(self):
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
    self.final_model_dir = self.models_cache_dir.joinpath("genericsect_bow_elmo")
    self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
    self.data_dir = pathlib.Path(DATA_DIR)
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.data_manager = self._get_data()
    self.hparams = self._get_hparams()
    self.model = self._get_model()
    self.infer = self._get_infer_client()
def __init__(self):
    super(CitationIntentClassification, self).__init__()
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
    self.final_model_dir = self.models_cache_dir.joinpath(
        "citation_intent_clf_elmo", "checkpoints"
    )
    self.data_dir = pathlib.Path(DATA_DIR)
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.hparams = self._get_hparams()
    self.data_manager = self._get_data()
    self.model: nn.Module = self._get_model()
    self.infer = self._get_infer_client()
def __init__(
    self,
    filename: str,
    dataset_type: str,
    max_num_words: int,
    max_instance_length: int,
    word_vocab_store_location: str,
    max_char_length: Optional[int] = None,
    char_vocab_store_location: Optional[str] = None,
    capitalization_vocab_store_location: Optional[str] = None,
    capitalization_emb_dim: Optional[int] = None,
    debug: bool = False,
    debug_dataset_proportion: float = 0.1,
    word_embedding_type: Union[str, None] = None,
    word_embedding_dimension: Union[int, None] = None,
    char_embedding_dimension: Union[int, None] = None,
    word_start_token: str = "<SOS>",
    word_end_token: str = "<EOS>",
    word_pad_token: str = "<PAD>",
    word_unk_token: str = "<UNK>",
    train_size: float = 0.8,
    test_size: float = 0.2,
    validation_size: float = 0.5,
    word_tokenization_type: str = "vanilla",
    word_add_start_end_token: bool = True,
    max_num_chars: Optional[int] = 10000,
    char_embedding_type: str = "random",
    char_unk_token: str = " ",
    char_pad_token: str = " ",
    char_end_token: str = " ",
    char_start_token: str = " ",
):
    self.filename = filename
    self.train_size = train_size
    self.test_size = test_size
    self.validation_size = validation_size
    self.dataset_type = dataset_type
    self.debug = debug
    self.debug_dataset_proportion = debug_dataset_proportion
    self.max_instance_length = max_instance_length
    self.word_add_start_end_token = word_add_start_end_token
    self.classnames2idx = self.get_classname2idx()
    self.instance_preprocessor = None
    self.idx2classname = {
        idx: classname for classname, idx in self.classnames2idx.items()
    }
    self.lines, self.labels = self.get_lines_labels(filename)
    self.msg_printer = wasabi.Printer()
    self.tag_visualizer = VisTagging()
def __init__(self):
    super(NeuralParscit, self).__init__()
    self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
    self.final_model_dir = self.models_cache_dir.joinpath("lstm_crf_parscit_final")
    self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
    self.data_dir = pathlib.Path(DATA_DIR)
    self.msg_printer = wasabi.Printer()
    self._download_if_required()
    self.hparams = self._get_hparams()
    self.data_manager = self._get_data()
    self.model: nn.Module = self._get_model()
    self.infer = self._get_infer_client()
    self.vis_tagger = VisTagging()
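# Usage sketch: instantiation downloads the pretrained model and data when
# they are not cached locally. predict_for_text is assumed here as the
# inference entry point, following SciWING's documented examples.
neural_parscit = NeuralParscit()
neural_parscit.predict_for_text(
    "Calzolari, N. (1982) Towards the organization of lexical definitions "
    "on a database structure. COLING '82."
)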
def __init__(
    self,
    model: nn.Module,
    model_filepath: str,
    datasets_manager: DatasetsManager,
    device: Optional[Union[str, torch.device]] = torch.device("cpu"),
    predicted_tags_namespace_prefix: str = "predicted_tags",
):
    super(SequenceLabellingInference, self).__init__(
        model=model,
        model_filepath=model_filepath,
        datasets_manager=datasets_manager,
        device=device,
    )
    self.predicted_tags_namespace_prefix = predicted_tags_namespace_prefix
    self.labels_namespaces = self.datasets_manager.label_namespaces
    self.msg_printer = wasabi.Printer()
    self.metrics_calculator = TokenClassificationAccuracy(
        datasets_manager=datasets_manager
    )

    # The key is the namespace of the different labels
    # The value is a dictionary of label -> idx
    self.label2idx_mapping: Dict[str, Dict[str, Any]] = {}
    self.idx2label_mapping: Dict[str, Dict[str, Any]] = {}

    for namespace in self.labels_namespaces:
        self.label2idx_mapping[namespace] = self.datasets_manager.get_label_idx_mapping(
            label_namespace=namespace
        )
        self.idx2label_mapping[namespace] = self.datasets_manager.get_idx_label_mapping(
            label_namespace=namespace
        )

    self.output_analytics = None
    self.output_df = None
    self.batch_size = 32
    self.load_model()

    self.namespace_to_unique_categories = {}
    self.namespace_to_visualizer = {}

    for namespace in self.labels_namespaces:
        categories = list(set(self.label2idx_mapping[namespace].keys()))
        visualizer = VisTagging(tags=categories)
        self.namespace_to_unique_categories[namespace] = categories
        self.namespace_to_visualizer[namespace] = visualizer
def __init__(self, foldername: str):
    """ Provides an interactive way to move some folders to S3

    Parameters
    ----------
    foldername : str
        The name of the folder that will be moved to the S3 bucket
    """
    self.foldername = foldername
    self.s3_config_json_filename = os.path.join(
        AWS_CRED_DIR, "aws_s3_credentials.json"
    )
    self.s3_util = S3Util(
        aws_cred_config_json_filename=self.s3_config_json_filename
    )
    self.msg_printer = wasabi.Printer()
    self.interact()
def __init__(self, dataset_name: str):
    """
    Parameters
    ----------
    dataset_name : str
        The class name of the dataset that will be generated
    """
    self.dataset_name = dataset_name
    self.template_file = pathlib.Path(
        TEMPLATES_DIR, "classification_dataset_template.txt"
    )
    self.msg_printer = wasabi.Printer()
    self.template = self._get_template()
    self.template_variables = self.interact()
def __init__(self, toml_filename: pathlib.Path, infer: bool = False):
    self.toml_filename = toml_filename
    self.infer = infer
    self.msg_printer = wasabi.Printer()
    self.doc = self._parse_toml_file()
    self.experiment_name = None
    self.experiment_dir = None

    # Dict {'train': Dataset, 'valid': Dataset, 'test': Dataset}
    self.all_datasets = None
    self.model_section = None
    self.dataset_section = None
    self.engine_section = None
    self.model = None
    self.engine = None
    self.model_dag = nx.DiGraph()
def __init__(self, toml_filename: pathlib.Path, infer: bool = False):
    self.toml_filename = toml_filename
    self.infer = infer
    self.msg_printer = wasabi.Printer()
    self.doc = self._parse_toml_file()
    self.data_dir = pathlib.Path(DATA_DIR)
    self.experiment_name = None
    self.experiment_dir = None
    self.datasets_manager = None
    self.model_section = None
    self.dataset_section = None
    self.engine_section = None
    self.model = None
    self.engine = None
    self.model_dag = nx.DiGraph()