Example #1
def convert_conll2003_ner_to_bioul(filename: str, out_filename: str):
    """ Converts the conll2003 file to bilou tagged strings
    and writes it to out_filename

    The out_filename will have the first column as word and
    the next three columns as the NER tags

    Parameters
    ----------
    filename: str
        Convert the file in conll2003 format to bioul tags
    out_filename: str
        Writes the file to bioul format

    Returns
    -------
    None

    """
    msg_printer = wasabi.Printer()
    lines: List[List[str]] = []
    labels: List[List[str]] = []

    with open(filename) as fp:
        lines_: List[str] = []
        labels_: List[str] = []  # every list is a label for one namespace
        for text in fp:
            text_ = text.strip()
            if text_.startswith("-DOCSTART-"):
                # document marker: reset the buffers and
                # skip the empty line that follows it
                lines_ = []
                labels_ = []
                next(fp)
            elif bool(text_):
                line_labels = text_.split()
                line_ = line_labels[0]
                label_ = line_labels[3]  # the NER tag is the fourth column
                lines_.append(line_)
                labels_.append(label_)
            else:
                if len(lines_) > 0 and len(labels_) > 0:
                    lines.append(lines_)
                    labels.append(labels_)
                    lines_ = []
                    labels_ = []
        # flush the last sentence if the file does not end with a blank line
        if len(lines_) > 0 and len(labels_) > 0:
            lines.append(lines_)
            labels.append(labels_)
    bilou_tags = []
    for label in labels:
        bilou_ = to_bioul(tag_sequence=label, encoding="IOB1")
        bilou_tags.append(bilou_)

    with msg_printer.loading(f"writing BILOU tags for {filename}"):
        with open(out_filename, "w") as fp:
            for line, bilou_tags_ in zip(lines, bilou_tags):
                assert len(line) == len(bilou_tags_)
                for word, tag in zip(line, bilou_tags_):
                    fp.write(" ".join([word, tag, tag, tag]))
                    fp.write("\n")

                fp.write("\n")
    msg_printer.good(f"Finished writing BILOU tags for {filename}")
Example #2
    def __init__(self, bert_type: str):
        super(TokenizerForBert, self).__init__()
        self.bert_type = bert_type
        self.msg_printer = wasabi.Printer()
        self.allowed_bert_types = [
            "bert-base-uncased",
            "bert-large-uncased",
            "bert-base-cased",
            "bert-large-cased",
            "scibert-base-cased",
            "scibert-sci-cased",
            "scibert-base-uncased",
            "scibert-sci-uncased",
        ]
        self.scibert_foldername_mapping = {
            "scibert-base-cased": "scibert_basevocab_cased",
            "scibert-sci-cased": "scibert_scivocab_cased",
            "scibert-base-uncased": "scibert_basevocab_uncased",
            "scibert-sci-uncased": "scibert_scivocab_uncased",
        }
        assert bert_type in self.allowed_bert_types, self.msg_printer.fail(
            f"You passed {bert_type} for attribute bert_type. "
            f"The allowed types are {self.allowed_bert_types}")
        self.vocab_type_or_filename = None
        if "scibert" in self.bert_type:
            foldername = self.scibert_foldername_mapping[self.bert_type]
            self.vocab_type_or_filename = os.path.join(EMBEDDING_CACHE_DIR,
                                                       foldername, "vocab.txt")
        else:
            self.vocab_type_or_filename = self.bert_type

        with self.msg_printer.loading("Loading Bert model"):
            self.tokenizer = BertTokenizer.from_pretrained(
                self.vocab_type_or_filename)
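A hedged usage sketch for the tokenizer wrapper above, assuming the pretrained vocabulary is already cached; the wrapped ``BertTokenizer`` does the actual word-piece tokenization:

# hypothetical usage; "bert-base-uncased" is one of the allowed types
tokenizer_for_bert = TokenizerForBert(bert_type="bert-base-uncased")
pieces = tokenizer_for_bert.tokenizer.tokenize("Scientific documents are fun")
# pieces is a list of word-piece strings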
Example #3
    def __init__(self):
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
        self.final_model_dir = self.models_cache_dir.joinpath(
            "genericsect_bow_elmo")

        if not self.models_cache_dir.is_dir():
            self.models_cache_dir.mkdir(parents=True)

        self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
        self.data_dir = pathlib.Path(DATA_DIR)

        if not self.data_dir.is_dir():
            self.data_dir.mkdir(parents=True)

        self.train_data_url = DATA_FILE_URLS["GENERIC_SECTION_TRAIN_FILE"]
        self.dev_data_url = DATA_FILE_URLS["GENERIC_SECTION_DEV_FILE"]
        self.test_data_url = DATA_FILE_URLS["GENERIC_SECTION_TEST_FILE"]

        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.data_manager = self._get_data()
        self.hparams = self._get_hparams()
        self.model = self._get_model()
        self.infer = self._get_infer_client()
        self.cli_interact = SciWINGInteract(self.infer)
Example #4
    def __init__(
        self,
        encoder: BowElmoEmbedder,
        encoding_dim: int,
        num_classes: int,
        classification_layer_bias: bool = True,
    ):
        """
        Parameters
        ----------
        encoder : BowElmoEmbedder
            Bag-of-words ELMo embedder that embeds a text by aggregating
            the ELMo representations across its words, by summing,
            averaging, or another strategy
        encoding_dim : int
            Dimension of the encoding of the text
        num_classes : int
            Number of classes in the dataset
        classification_layer_bias : bool
            Whether to add a bias to the classification layer.
            Set this to False only for testing or debugging;
            otherwise keep it True
        """
        super(BowElmoLinearClassifier, self).__init__()
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        self.classification_layer_bias = classification_layer_bias

        self.classification_layer = nn.Linear(
            encoding_dim, num_classes, bias=self.classification_layer_bias)
        self._loss = CrossEntropyLoss()
        self.msg_printer = wasabi.Printer()
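A sketch of wiring the classifier together, assuming ELMo's standard 1024-dimensional representations and a hypothetical 3-class dataset; the ``BowElmoEmbedder`` arguments follow the signatures shown in the later examples:

# hedged sketch; the number of classes is illustrative
encoder = BowElmoEmbedder(layer_aggregation="sum")
classifier = BowElmoLinearClassifier(
    encoder=encoder,
    encoding_dim=1024,  # ELMo representations are 1024-dimensional
    num_classes=3,
)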
Example #5
File: i2b2.py Project: yyht/sciwing
    def __init__(self):
        super(I2B2NER, self).__init__()
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

        if not self.models_cache_dir.is_dir():
            self.models_cache_dir.mkdir(parents=True)

        self.final_model_dir = self.models_cache_dir.joinpath("i2b2")
        self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
        self.data_dir = pathlib.Path(DATA_DIR)

        if not self.data_dir.is_dir():
            self.data_dir.mkdir()

        self.train_data_url = DATA_FILE_URLS["I2B2_TRAIN"]
        self.dev_data_url = DATA_FILE_URLS["I2B2_DEV"]
        self.test_data_url = DATA_FILE_URLS["I2B2_DEV"]
        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.hparams = self._get_hparams()
        self.data_manager = self._get_data()
        self.model: nn.Module = self._get_model()
        self.infer = self._get_infer_client()
        self.vis_tagger = VisTagging()
        self.cli_interact = SciWINGInteract(self.infer)
Example #6
    def __init__(
            self,
            model: nn.Module,
            model_filepath: str,
            dataset: BaseSeqLabelingDataset,
            device: Optional[Union[str, torch.device]] = torch.device("cpu"),
    ):

        super(ParscitInference, self).__init__(model=model,
                                               model_filepath=model_filepath,
                                               dataset=dataset,
                                               device=device)

        self.msg_printer = wasabi.Printer()
        self.labelname2idx_mapping = self.dataset.get_classname2idx()
        self.idx2labelname_mapping = {
            idx: label_name
            for label_name, idx in self.labelname2idx_mapping.items()
        }
        self.metrics_calculator = TokenClassificationAccuracy(
            idx2labelname_mapping=self.idx2labelname_mapping)
        self.output_analytics = None
        self.output_df = None
        self.batch_size = 32
        self.load_model()

        num_categories = self.dataset.get_num_classes()
        categories = [
            self.idx2labelname_mapping[idx] for idx in range(num_categories)
        ]
        self.seq_tagging_visualizer = VisTagging(tags=categories)
Example #7
    def __init__(self, log_file: str = None, device: str = "cpu"):
        self.device = device
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

        if not self.models_cache_dir.is_dir():
            self.models_cache_dir.mkdir(parents=True)

        self.final_model_dir = self.models_cache_dir.joinpath(
            "sectlabel_elmo_bilstm")
        self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
        self.data_dir = pathlib.Path(DATA_DIR)

        if not self.data_dir.is_dir():
            self.data_dir.mkdir(parents=True)

        self.train_data_url = DATA_FILE_URLS["SECT_LABEL_TRAIN_FILE"]
        self.dev_data_url = DATA_FILE_URLS["SECT_LABEL_DEV_FILE"]
        self.test_data_url = DATA_FILE_URLS["SECT_LABEL_TEST_FILE"]

        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.data_manager = self._get_data()
        self.hparams = self._get_hparams()
        self.model = self._get_model()
        self.infer = self._get_infer_client()
        self.cli_interact = SciWINGInteract(self.infer)
        self.log_file = log_file

        if log_file:
            self.logger = setup_logger("sectlabel_logger",
                                       logfile=self.log_file,
                                       level=logging.INFO)
        else:
            self.logger = self.msg_printer
Example #8
    def __init__(
            self,
            dropout_value: float = 0.5,
            datasets_manager: DatasetsManager = None,
            word_tokens_namespace: str = "tokens",
            device: torch.device = torch.device("cpu"),
            fine_tune: bool = False,
    ):
        super(ElmoEmbedder, self).__init__()

        # Sometimes you need two different tensors that are
        # two different linear combinations of the layer representations
        # TODO: change this in case you need 2 representations
        self.num_output_representations = 1
        self.dropout_value = dropout_value
        self.datasets_manager = datasets_manager
        self.device = torch.device(device) if isinstance(device,
                                                         str) else device
        self.msg_printer = wasabi.Printer()
        self.word_tokens_namespace = word_tokens_namespace
        self.fine_tune = fine_tune
        self.embedder_name = "ElmoEmbedder"

        with self.msg_printer.loading("Loading Elmo Object"):
            self.elmo: nn.Module = Elmo(
                options_file=ELMO_OPTIONS_FILE,
                weight_file=ELMO_WEIGHTS_FILE,
                num_output_representations=self.num_output_representations,
                dropout=self.dropout_value,
                requires_grad=fine_tune,
            )

        self.msg_printer.good(f"Finished Loading ELMO object")
Example #9
    def __init__(self):
        super(CitationIntentClassification, self).__init__()
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)

        if not self.models_cache_dir.is_dir():
            self.models_cache_dir.mkdir(parents=True)

        self.final_model_dir = self.models_cache_dir.joinpath(
            "citation_intent_clf_elmo")

        self.data_dir = pathlib.Path(DATA_DIR)

        if not self.data_dir.is_dir():
            self.data_dir.mkdir(parents=True)

        self.train_data_url = DATA_FILE_URLS["SCICITE_TRAIN"]
        self.dev_data_url = DATA_FILE_URLS["SCICITE_DEV"]
        self.test_data_url = DATA_FILE_URLS["SCICITE_TEST"]
        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.hparams = self._get_hparams()
        self.data_manager = self._get_data()
        self.model: nn.Module = self._get_model()
        self.infer = self._get_infer_client()
        self.cli_interact = SciWINGInteract(infer_client=self.infer)
Example #10
    def __init__(
            self,
            model: nn.Module,
            model_filepath: str,
            datasets_manager: DatasetsManager,
            device: Optional[Union[str, torch.device]] = torch.device("cpu"),
    ):
        """

        Parameters
        ----------
        model : nn.Module
            A pytorch module
        model_filepath : str
            The path where the parameters of the best model are stored. This is usually
            the ``best_model.pt`` file in an experiment directory
        datasets_manager : DatasetsManager
            The datasets manager that holds the train, dev and test datasets
        device : Optional[Union[str, torch.device]]
            This is either a string like ``cpu``, ``cuda:0`` or a torch.device object
        """
        self.model = model
        self.model_filepath = model_filepath
        self.datasets_manager = datasets_manager

        self.device = torch.device(device) if isinstance(device,
                                                         str) else device
        self.msg_printer = wasabi.Printer()
Example #11
import json
import os
from typing import List

import wasabi


def print_reminders(tools: List[dict]) -> None:
    storage_data = {}

    try:
        script_dir = os.path.dirname(__file__)
        storage_file_path = os.path.join(script_dir,
                                         "tool_reminder_storage.json")

        with open(storage_file_path, 'r') as json_file:
            storage_data = json.load(json_file)
    except FileNotFoundError:
        pass

    printer = wasabi.Printer()

    for tool in tools:
        tool_name = tool['name']
        tool_list = tool['list']
        storage_key = 'curr {0} index'.format(tool_name)
        curr_index = storage_data.get(storage_key, 0)
        tools_per_print = min(3, len(tool_list))

        printer.warn(tool_name)  # use warn because of the color
        for _ in range(tools_per_print):
            printer.info(tool_list[curr_index])
            curr_index = (curr_index + 1) % len(tool_list)
        if tool != tools[-1]:
            print()

        storage_data[storage_key] = curr_index

    with open(storage_file_path, 'w') as outfile:
        json.dump(storage_data, outfile)
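Since ``print_reminders`` is self-contained, a runnable call only needs a list of tool dictionaries with ``name`` and ``list`` keys; the rotation index persists in ``tool_reminder_storage.json`` between runs:

# example input; the tool names are illustrative
tools = [
    {"name": "editors", "list": ["vim", "emacs", "vscode", "sublime"]},
    {"name": "shells", "list": ["bash", "zsh", "fish"]},
]
print_reminders(tools)  # prints up to 3 entries per tool, rotating each run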
Example #12
    def __init__(
        self,
        datasets_manager: DatasetsManager = None,
        layer_aggregation: str = "sum",
        device: Union[str, torch.device] = torch.device("cpu"),
        word_tokens_namespace="tokens",
    ):
        """ Bag of words Elmo Embedder which aggregates elmo embedding for every token

        Parameters
        ----------
        layer_aggregation : str
            You can chose one of ``[sum, average, last, first]``
            which decides how to aggregate different layers of ELMO. ELMO produces three
            layers of representations

            sum
                Representations from different layers are summed
            average
                Representations from different layers are average
            last
                Representations from last layer is considered
            first
                Representations from first layer is considered

        device : Union[str, torch.device]
            device for running the model on

        word_tokens_namespace: int
            Namespace where all the word tokens are stored
        """
        super(BowElmoEmbedder, self).__init__()
        self.dataset_manager = datasets_manager
        self.embedding_dimension = self.get_embedding_dimension()
        self.embedder_name = "elmo"
        self.word_tokens_namespace = word_tokens_namespace
        self.layer_aggregation_type = layer_aggregation
        self.allowed_layer_aggregation_types = [
            "sum", "average", "last", "first"
        ]
        self.device = torch.device(device) if isinstance(device, str) else device

        # device.index is None for cpu; 0 is a valid cuda index,
        # so compare against None instead of relying on truthiness
        if self.device.index is not None:
            self.cuda_device_id = self.device.index
        else:
            self.cuda_device_id = -1
        self.msg_printer = wasabi.Printer()

        assert (
            self.layer_aggregation_type in self.allowed_layer_aggregation_types
        ), self.msg_printer.fail(
            f"For bag of words elmo encoder, the allowable aggregation "
            f"types are {self.allowed_layer_aggregation_types}. You passed {self.layer_aggregation_type}"
        )

        # load the elmo embedders
        with self.msg_printer.loading("Creating Elmo object"):
            self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
        self.msg_printer.good("Finished Loading Elmo object")
Example #13
    def __init__(self, infer_client: BaseInterfaceClient):
        if isinstance(infer_client, BaseInterfaceClient):
            self.infer_obj = infer_client.build_infer()
        else:
            # You can pass the infer obj directly
            # Refer to sciwing.infer.seq_label.BaseSeqLabelInference or
            # sciwing.infer.seq_label.BaseClassificationInference
            self.infer_obj = infer_client
        self.msg_printer = wasabi.Printer()
Example #14
    def __init__(
            self,
            embedder,
            dropout_value: float = 0.0,
            hidden_dim: int = 1024,
            bidirectional: bool = False,
            combine_strategy: str = "concat",
            rnn_bias: bool = True,
            device: Union[str, torch.device] = torch.device("cpu"),
    ):
        """LSTM2Vec encoder that encodes a series of tokens to a single vector representation

        Parameters
        ----------
        embedder : nn.Module
            Any embedder can be passed
        dropout_value : float
            The dropout value for input embeddings
        hidden_dim : int
            The hidden dimension for the LSTM
        bidirectional : bool
            Whether the LSTM is bidirectional
        combine_strategy : str
            Strategy to combine the vectors from the two directions
        rnn_bias : bool
            Whether to use bias in the RNN. Set this to False only for debugging purposes
        device : Union[str, torch.device]
            The device on which the model is run
        """
        super(LSTM2VecEncoder, self).__init__()
        self.embedder = embedder
        self.emb_dim = embedder.get_embedding_dimension()
        self.dropout_value = dropout_value
        self.hidden_dimension = hidden_dim
        self.bidirectional = bidirectional
        self.num_directions = 2 if self.bidirectional else 1
        self.num_layers = 1
        self.combine_strategy = combine_strategy
        self.allowed_combine_strategies = ["sum", "concat"]
        self.rnn_bias = rnn_bias
        self.device = torch.device(device) if isinstance(device,
                                                         str) else device
        self.msg_printer = wasabi.Printer()

        assert (self.combine_strategy
                in self.allowed_combine_strategies), self.msg_printer.fail(
                    f"The combine strategies can be one of "
                    f"{self.allowed_combine_strategies}. You passed "
                    f"{self.combine_strategy}")

        self.emb_dropout = nn.Dropout(p=self.dropout_value)
        self.rnn = nn.LSTM(
            input_size=self.emb_dim,
            hidden_size=self.hidden_dimension,
            bias=self.rnn_bias,
            batch_first=True,
            bidirectional=self.bidirectional,
        )
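For a bidirectional encoder the final hidden states of the two directions must be merged. A sketch of what the two allowed combine strategies produce, assuming the default hidden dimension of 1024:

import torch

fwd = torch.randn(4, 1024)  # final hidden state, forward direction
bwd = torch.randn(4, 1024)  # final hidden state, backward direction

concatenated = torch.cat([fwd, bwd], dim=1)  # "concat" -> (4, 2048)
summed = fwd + bwd                           # "sum"    -> (4, 1024)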
Example #15
def write_pubmed_data_to_sciwing_seq2seq(pubmed_dir: str, subset: str,
                                         out_filename: str):
    """ SciCite files are jsonl filenames with citation strings.

    Parameters
    ----------
    pubmed_dir : str
        The directory path to where pubmed dataset

    subset : str
        Choose from train, test and val

    out_filename : str
        Output file name

    Returns
    -------
    None

    """
    printer = wasabi.Printer()
    lines = []

    text_dir = os.path.join(pubmed_dir, "inputs", subset)
    abstract_dir = os.path.join(pubmed_dir, "human-abstracts", subset)
    filename_list = [
        filename.split(".")[0] for filename in os.listdir(text_dir)
    ]

    print(f"Reading pubmed {subset} data")
    for filename in tqdm(filename_list):
        with open(os.path.join(abstract_dir, f"{filename}.txt"), "r") as fp:
            abstract = fp.read()

        with open(os.path.join(text_dir, f"{filename}.json"), "r") as fp:
            input_json = json.load(fp)

        abstract = abstract.strip().replace("\n", " ")
        text = " ".join([section["text"] for section in input_json["inputs"]])
        text = text.strip().replace("\n", " ")

        if bool(text) and bool(abstract):
            line = "###".join([text, abstract])
            lines.append(line)

    print(f"Writing pubmed {subset} data")
    with open(os.path.join(pubmed_dir, out_filename), "w") as fp:
        for line in lines:
            fp.write(line)
            fp.write("\n")

    printer.good(f"Finished writing {out_filename}")
Example #16
    def __init__(
        self,
        emb_dim: int = 1024,
        dropout_value: float = 0.0,
        layer_aggregation: str = "sum",
        cuda_device_id: int = -1,
    ):
        """ Bag of words Elmo Embedder which aggregates elmo embedding for every token

        Parameters
        ----------
        emb_dim : int
            Embedding dimension
        dropout_value : float
            Any input dropout to be applied to the embeddings
        layer_aggregation : str
            You can chose one of ``[sum, average, last, first]``
            which decides how to aggregate different layers of ELMO. ELMO produces three
            layers of representations

            sum
                Representations from different layers are summed
            average
                Representations from different layers are average
            last
                Representations from last layer is considered
            first
                Representations from first layer is considered

        cuda_device_id : int
            Cuda device id on which representations will be transferred
            -1 indicates cpu
        """
        super(BowElmoEmbedder, self).__init__()
        self.emb_dim = emb_dim
        self.dropout_value = dropout_value
        self.layer_aggregation_type = layer_aggregation
        self.allowed_layer_aggregation_types = [
            "sum", "average", "last", "first"
        ]
        self.cuda_device_id = cuda_device_id
        self.device = (torch.device("cpu") if cuda_device_id < 0 else
                       torch.device(f"cuda:{cuda_device_id}"))
        self.msg_printer = wasabi.Printer()

        assert (
            self.layer_aggregation_type in self.allowed_layer_aggregation_types
        ), self.msg_printer.fail(
            f"For bag of words elmo encoder, the allowable aggregation "
            f"types are {self.allowed_layer_aggregation_types}. You passed {self.layer_aggregation_type}"
        )

        # load the elmo embedders
        with self.msg_printer.loading("Creating Elmo object"):
            self.elmo = ElmoEmbedder(cuda_device=self.cuda_device_id)
        self.msg_printer.good("Finished Loading Elmo object")
Example #17
    def __init__(self, hparams: Dict[str, Any]):
        self.hparams = hparams
        data_dir = pathlib.Path(DATA_DIR)
        self.train_filename = data_dir.joinpath("eng.train")
        self.dev_filename = data_dir.joinpath("eng.testa")
        self.test_filename = data_dir.joinpath("eng.testb")
        self.printer = wasabi.Printer()
        self.data_manager = self.build_dataset()
        self.model = self.build_model()
        self.infer = self.build_infer()
Example #18
    def __init__(
        self,
        token2idx: Dict,
        embedding_type: Union[str, None] = None,
        embedding_dimension: Union[int, None] = None,
    ):
        self.token2idx = token2idx
        self.embedding_type = embedding_type
        self.embedding_dimension = embedding_dimension
        self.msg_printer = wasabi.Printer()
        self.vocab_embedding = self.load_embedding()
Example #19
def write_extractive_to_sciwing_text_clf(extractive_data_dir: str,
                                         data_group: str, out_filename: str):
    """
    The preprocessed extractive summarization dataset contains 3 folders:
        human-abstracts (ground truth),
        inputs (document id, original sentences, tokenized sentences),
        labels (document id, labels of each sentence in the document indicating whether this sentence should be included
                in the summary).
    Each folder has 3 sub-folders: train, test, val. Each json file under the sub-folder contains one document.

    Parameters
    ----------
    extractive_data_dir : str
        The directory where all the data file are stored

    data_group : str
        Choose from train, dev, and test. Dev corresponds to the val folder in the input data

    out_filename : str
        The output filename where the extractive summarization dataset is stored

    Returns
    -------
    None
    """
    printer = wasabi.Printer()
    document = []
    input_data_human_abstract_dir = Path(extractive_data_dir,
                                         "human-abstracts", data_group)
    input_data_inputs_dir = Path(extractive_data_dir, "inputs", data_group)
    input_data_labels_dir = Path(extractive_data_dir, "labels", data_group)
    filename_list = [f.stem for f in input_data_human_abstract_dir.iterdir()]

    with printer.loading(f"Writing f{out_filename}"):
        for filename in filename_list:
            ha_filename = input_data_human_abstract_dir.joinpath(
                f"{filename}.text")
            input_filename = input_data_inputs_dir.joinpath(f"{filename}.json")
            label_filename = input_data_labels_dir.joinpath(f"{filename}.json")

            with open(ha_filename, "r") as fp:
                abstract_str = fp.read().strip()
                abstract_str = abstract_str.strip().replace("\n", " ")

            with open(input_filename, "r") as fp:
                input_dict = json.load(fp)
                input_str = [
                    sent["text"].strip().replace("\n", " ")
                    for sent in input_dict["inputs"]
                ]

            with open(label_filename, "r") as fp:
                label_dict = json.load(fp)
                label_str = []
Example #20
    def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None):
        super(TokenClassificationAccuracy, self).__init__()
        self.idx2labelname_mapping = idx2labelname_mapping
        self.msg_printer = wasabi.Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils(
            idx2labelname_mapping=idx2labelname_mapping)

        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}
Example #21
    def __init__(
        self,
        train_dataset: Dataset,
        dev_dataset: Dataset = None,
        test_dataset: Dataset = None,
        namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
        namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
        batch_size: int = 32,
    ):
        """

        Parameters
        ----------
        train_dataset : Dataset
            A pytorch dataset that represents training data
        dev_dataset : Dataset
            A pytorch dataset that represents validation data
        test_dataset : Dataset
            A pytorch dataset that represents test data
        namespace_vocab_options : Dict[str, Dict[str, Any]]
            For every namespace you can give a set of options that will
            be passed down to Vocab.
        namespace_numericalizer_map : Dict[str, BaseNumericalizer]
            For every namespace, you can give the numericalizer instance
            that will be used for that namespace
        batch_size: int
            Batch size for loading the datasets
        """
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.label_namespaces: List[str] = None  # Holds the label namespaces
        self.msg_printer = wasabi.Printer()

        if namespace_vocab_options is None:
            self.namespace_vocab_options = {}
        else:
            self.namespace_vocab_options = namespace_vocab_options

        self.batch_size = batch_size

        self.namespace_to_numericalizer: Dict[
            str, BaseNumericalizer] = namespace_numericalizer_map

        # Build vocab using the datasets passed
        self.namespace_to_vocab: Dict[str, Vocab] = self.build_vocab()

        # sets the vocab for the appropriate numericalizers
        self.namespace_to_numericalizer = self.build_numericalizers()
        self.namespaces = list(self.namespace_to_vocab.keys())
        self.num_labels = {}
        for namespace in self.label_namespaces:
            vocab = self.namespace_to_vocab[namespace]
            self.num_labels[namespace] = vocab.get_vocab_len()
Example #22
    def __init__(self):
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
        self.final_model_dir = self.models_cache_dir.joinpath("genericsect_bow_elmo")
        self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
        self.data_dir = pathlib.Path(DATA_DIR)
        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.data_manager = self._get_data()
        self.hparams = self._get_hparams()
        self.model = self._get_model()
        self.infer = self._get_infer_client()
Example #23
    def __init__(self):
        super(CitationIntentClassification, self).__init__()
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
        self.final_model_dir = self.models_cache_dir.joinpath(
            "citation_intent_clf_elmo", "checkpoints")
        self.data_dir = pathlib.Path(DATA_DIR)
        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.hparams = self._get_hparams()
        self.data_manager = self._get_data()
        self.model: nn.Module = self._get_model()
        self.infer = self._get_infer_client()
Example #24
    def __init__(
        self,
        filename: str,
        dataset_type: str,
        max_num_words: int,
        max_instance_length: int,
        word_vocab_store_location: str,
        max_char_length: Optional[int] = None,
        char_vocab_store_location: Optional[str] = None,
        captialization_vocab_store_location: Optional[str] = None,
        capitalization_emb_dim: Optional[int] = None,
        debug: bool = False,
        debug_dataset_proportion: float = 0.1,
        word_embedding_type: Union[str, None] = None,
        word_embedding_dimension: Union[int, None] = None,
        char_embedding_dimension: Union[int, None] = None,
        word_start_token: str = "<SOS>",
        word_end_token: str = "<EOS>",
        word_pad_token: str = "<PAD>",
        word_unk_token: str = "<UNK>",
        train_size: float = 0.8,
        test_size: float = 0.2,
        validation_size: float = 0.5,
        word_tokenization_type="vanilla",
        word_add_start_end_token: bool = True,
        max_num_chars: Optional[int] = 10000,
        char_embedding_type: str = "random",
        char_unk_token: str = " ",
        char_pad_token: str = " ",
        char_end_token: str = " ",
        char_start_token: str = " ",
    ):

        self.filename = filename
        self.train_size = train_size
        self.test_size = test_size
        self.validation_size = validation_size
        self.dataset_type = dataset_type
        self.debug = debug
        self.debug_dataset_proportion = debug_dataset_proportion
        self.max_instance_length = max_instance_length

        self.word_add_start_end_token = word_add_start_end_token
        self.classnames2idx = self.get_classname2idx()
        self.instance_preprocessor = None
        self.idx2classname = {
            idx: classname for classname, idx in self.classnames2idx.items()
        }

        self.lines, self.labels = self.get_lines_labels(filename)

        self.msg_printer = wasabi.Printer()
        self.tag_visualizer = VisTagging()
Example #25
    def __init__(self):
        super(NeuralParscit, self).__init__()
        self.models_cache_dir = pathlib.Path(MODELS_CACHE_DIR)
        self.final_model_dir = self.models_cache_dir.joinpath(
            "lstm_crf_parscit_final")
        self.model_filepath = self.final_model_dir.joinpath("best_model.pt")
        self.data_dir = pathlib.Path(DATA_DIR)
        self.msg_printer = wasabi.Printer()
        self._download_if_required()
        self.hparams = self._get_hparams()
        self.data_manager = self._get_data()
        self.model: nn.Module = self._get_model()
        self.infer = self._get_infer_client()
        self.vis_tagger = VisTagging()
Example #26
    def __init__(
        self,
        model: nn.Module,
        model_filepath: str,
        datasets_manager: DatasetsManager,
        device: Optional[Union[str, torch.device]] = torch.device("cpu"),
        predicted_tags_namespace_prefix: str = "predicted_tags",
    ):
        super(SequenceLabellingInference, self).__init__(
            model=model,
            model_filepath=model_filepath,
            datasets_manager=datasets_manager,
            device=device,
        )

        self.predicted_tags_namespace_prefix = predicted_tags_namespace_prefix
        self.labels_namespaces = self.datasets_manager.label_namespaces
        self.msg_printer = wasabi.Printer()
        self.metrics_calculator = TokenClassificationAccuracy(
            datasets_manager=datasets_manager)

        # The key is the namespace of different labels
        # The value is a dictionary of label -> idx
        self.label2idx_mapping: Dict[str, Dict[str, Any]] = {}
        self.idx2label_mapping: Dict[str, Dict[str, Any]] = {}
        for namespace in self.labels_namespaces:
            self.label2idx_mapping[
                namespace] = self.datasets_manager.get_label_idx_mapping(
                    label_namespace=namespace)
            self.idx2label_mapping[
                namespace] = self.datasets_manager.get_idx_label_mapping(
                    label_namespace=namespace)

        self.output_analytics = None
        self.output_df = None
        self.batch_size = 32
        self.load_model()

        self.namespace_to_unique_categories = {}
        self.namespace_to_visualizer = {}
        for namespace in self.labels_namespaces:
            categories = list(self.label2idx_mapping[namespace].keys())
            visualizer = VisTagging(tags=categories)
            self.namespace_to_unique_categories[namespace] = categories
            self.namespace_to_visualizer[namespace] = visualizer
Example #27
    def __init__(self, foldername: str):
        """ Provides an interactive way to move some folders to s3

        Parameters
        ----------
        foldername : str
            The folder name which will be moved to S3 bucket
        """
        self.foldername = foldername
        self.s3_config_json_filename = os.path.join(AWS_CRED_DIR,
                                                    "aws_s3_credentials.json")
        self.s3_util = S3Util(
            aws_cred_config_json_filename=self.s3_config_json_filename)
        self.msg_printer = wasabi.Printer()
        self.interact()
Example #28
    def __init__(self, dataset_name: str):
        """

        Parameters
        ----------
        dataset_name : str
            The class name of the dataset that will be generated

        """
        self.dataset_name = dataset_name
        self.template_file = pathlib.Path(
            TEMPLATES_DIR, "classification_dataset_template.txt")
        self.msg_printer = wasabi.Printer()
        self.template = self._get_template()
        self.template_variables = self.interact()
Example #29
    def __init__(self, toml_filename: pathlib.Path, infer: bool = False):
        self.toml_filename = toml_filename
        self.infer = infer
        self.msg_printer = wasabi.Printer()
        self.doc = self._parse_toml_file()

        self.experiment_name = None
        self.experiment_dir = None
        # Dict {'train': Dataset, 'valid': Dataset, 'test': Dataset}
        self.all_datasets = None
        self.model_section = None
        self.dataset_section = None
        self.engine_section = None
        self.model = None
        self.engine = None
        self.model_dag = nx.DiGraph()
Example #30
    def __init__(self, toml_filename: pathlib.Path, infer: bool = False):
        self.toml_filename = toml_filename
        self.infer = infer
        self.msg_printer = wasabi.Printer()
        self.doc = self._parse_toml_file()
        self.data_dir = pathlib.Path(DATA_DIR)

        self.experiment_name = None
        self.experiment_dir = None
        self.datasets_manager = None
        self.model_section = None
        self.dataset_section = None
        self.engine_section = None
        self.model = None
        self.engine = None
        self.model_dag = nx.DiGraph()