def test_has_all_types(self):
        """
        Tests that all tokens are given every semantic type label found in the dataset; this ensures that
        when a new semantic type is found, old Docs retroactively receive the corresponding annotation
        """
        data = sample_dataset
        ents = data.get_labels(as_list=True)
        pipe = self.Pipeline(ents=ents)
        model = Model(pipe)

        all_sem_types = set()
        for mm_file in [d.metamapped_path for d in data]:
            with open(mm_file) as f:
                text = f.read()
            all_sem_types |= set(
                re.findall(r'(?<="SemType": ")[a-z]+(?=")', text))

        # Strictly speaking, this test is only looking at the target words rather than all words in the window
        type_labels = [
            '0:feature_is_' + sem_type for sem_type in all_sem_types
        ]

        model.preprocess(data)

        for entry in model.X_data:
            for entry_dict in entry[0]:
                for label in type_labels:
                    self.assertIn(label, entry_dict.keys())
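For context, here is a quick sketch of the text the regex above matches; the surrounding JSON shape is assumed from the pattern itself, not taken from MetaMap documentation:

import re

# Assumed fragment of a metamapped file; only the "SemType" key matters here.
sample = '{"Candidate": {"SemType": "orch"}}'
print(re.findall(r'(?<="SemType": ")[a-z]+(?=")', sample))  # prints ['orch']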
Example #2
def load():
    entities = [
        'Drug', 'Form', 'Route', 'ADE', 'Reason', 'Frequency', 'Duration',
        'Dosage', 'Strength'
    ]
    pipeline = N2C2Pipeline(entities=entities)
    model = Model(pipeline)
    model_directory = resource_filename('medacy_model_clinical_notes', 'model')
    model.load(os.path.join(model_directory, 'n2c2_2020_jan_22.pkl'))
    return model
Example #3
def load():
    entities = ['Drug', 'Form', 'Route', 'ADE', 'Reason', 'Frequency', 'Duration', 'Dosage', 'Strength']
    pipeline = BertPipeline(
        entities=entities, using_crf=True,
        pretrained_model='emilyalsentzer/Bio_ClinicalBERT',
        **PIPELINE_ARGS)
    model = Model(pipeline)
    model_directory = resource_filename('medacy_bert_model_clinical_notes', 'model')
    model_directory = os.path.join(model_directory, 'torch')
    model.load(model_directory)
    return model
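A usage sketch for either loader above; it assumes the corresponding packaged model is installed and mirrors the predict-and-iterate pattern used elsewhere on this page:

# Hypothetical usage; requires the packaged model files to be present.
model = load()
annotations = model.predict("The patient was prescribed 1 capsule of Advil for five days.")
for annotation in annotations:
    print(annotation)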
Example #4
def _activate_model(model_path, pipeline_class, args, kwargs):
    """
    Creates a Model with the given pipeline configuration and sets its weights to the pickled model path
    :param model_path: path to the model pickle file
    :param pipeline_class: the pipeline class for the pickled model
    :param args, kwargs: arguments to pass to the pipeline constructor
    :return: a usable Model instance
    """
    pipeline_instance = pipeline_class(*args, **kwargs)
    model = Model(pipeline_instance)
    model.load(model_path)
    return model
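A minimal sketch of calling _activate_model; the pipeline class, constructor arguments, and pickle path below are placeholders (TestingPipeline echoes the class used later on this page):

# Hypothetical call; all values below are placeholders.
model = _activate_model(
    'sample_models/sample_test_pipe.pkl',  # path to the pickled weights
    TestingPipeline,                       # pipeline class, not an instance
    (),                                    # positional args for the pipeline constructor
    {'entities': ['Drug', 'Dosage']},      # keyword args for the pipeline constructor
)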
Example #5
    def test_cross_validate(self):
        """Ensures that changes made in the package do not prevent cross_validate from running to completion"""
        model = Model(self.pipeline)

        # Test that invalid fold counts raise ValueError
        for num in [-1, 0, 1]:
            with self.assertRaises(ValueError):
                model.cross_validate(self.dataset, num)

        try:
            resulting_data = model.cross_validate(self.dataset, 2)
            # Checking the log can help verify that the results of cross validation are as expected
            logging.debug(resulting_data)
        except Exception:
            self.fail("cross_validate() raised an exception")
Example #6
    def test_prediction_with_testing_pipeline(self):
        """Tests that a model created with the BiLSTM+CRF can be fitted and used to predict"""
        pipeline = LstmSystematicReviewPipeline(
            entities=self.entities,
            word_embeddings=word_embeddings,
            cuda_device=cuda_device)

        model = Model(pipeline)
        model.fit(self.dataset)
        resulting_dataset = model.predict(
            self.dataset, prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        # Test that there is at least one prediction
        if not any(resulting_dataset.generate_annotations()):
            warn("The model did not generate any predictions")
Example #7
def main(model: str, text: str = None, cuda: int = None):
    if text is None:
        text = "The patient was prescribed 1 capsule of Advil for five days."
    if model == 'crf':
        print('using CRF model')
        model = Model.load_external('medacy_model_clinical_notes')
    else:
        print('using BERT model')
        if cuda is not None:
            from medacy_bert_model_clinical_notes import medacy_bert_model_clinical_notes
            medacy_bert_model_clinical_notes.PIPELINE_ARGS[
                'cuda_device'] = cuda
        model = Model.load_external('medacy_bert_model_clinical_notes')
    anons = model.predict(text)
    for anon in anons:
        print(anon)
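A hedged sketch of calling main directly; in its source this function is presumably wired to a command-line entry point:

# Hypothetical direct calls; both model packages must be installed.
main('crf')                                            # CRF model, default sample sentence
main('bert', text='Take 2 tablets of aspirin daily.')  # BERT model, custom text, CPU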
Example #8
    def test_run_through_pipeline(self):
        """
        Tests that this function runs a document through the pipeline by testing that it has attributes
        overlayed by the pipeline
        """
        model = Model(self.pipeline)
        sample_df = list(self.dataset)[0]
        result = model._run_through_pipeline(sample_df)

        expected = sample_df.txt_path
        actual = result._.file_name
        self.assertEqual(actual, expected)

        expected = sample_df.ann_path
        actual = result._.gold_annotation_file
        self.assertEqual(actual, expected)
Example #9
    def test_cross_validate_create_groundtruth_predictions(self):
        """
        Tests that during cross validation, the medaCy groundtruth (that is, the version of the training dataset
        used by medaCy) is written as well as the predictions that are created for each fold
        """
        model = Model(self.pipeline)
        model.cross_validate(self.dataset,
                             num_folds=2,
                             prediction_directory=self.prediction_directory_3,
                             groundtruth_directory=self.groundtruth_directory)

        prediction_dataset = Dataset(self.prediction_directory_3)
        groundtruth_dataset = Dataset(self.groundtruth_directory)

        for d in [prediction_dataset, groundtruth_dataset]:
            self.assertIsInstance(d, Dataset)

        original_file_names = {d.file_name for d in self.dataset}
        prediction_file_names = {d.file_name for d in prediction_dataset}
        groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

        for n in [prediction_file_names, groundtruth_file_names]:
            self.assertSetEqual(n, original_file_names)

        # Container for all Annotations in all files in all folds
        all_anns_all_folds_actual = Annotations([])

        # Test that fold groundtruth is written to file
        for fold_name in ["fold_1", "fold_2"]:
            fold_dataset = Dataset(groundtruth_dataset.data_directory /
                                   fold_name)
            for d in fold_dataset:
                fold_ann = Annotations(d.ann_path)
                groundtruth_ann = groundtruth_dataset[d.file_name]
                # Test that the entities in the fold groundtruth are a subset of the whole for that file
                self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
                all_anns_all_folds_actual |= fold_ann

        # Container for all annotations pulled directly from the groundtruth dataset
        all_groundtruth_tuples = Annotations([])
        for ann in groundtruth_dataset.generate_annotations():
            all_groundtruth_tuples |= ann

        expected = set(all_groundtruth_tuples)
        actual = set(all_anns_all_folds_actual)
        self.assertSetEqual(expected, actual)
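The same cross_validate call can be made outside a test; a hedged sketch mirroring the signature above, with placeholder pipeline, dataset, and output paths:

# Hypothetical standalone use; pipeline, dataset, and paths are placeholders.
model = Model(pipeline)
model.cross_validate(dataset,
                     num_folds=5,
                     prediction_directory='output/predictions',
                     groundtruth_directory='output/groundtruth')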
Example #10
def setup(args):
    """
    Sets up the dataset and pipeline/model, since they are used by every command.

    :param args: Argparse args object.
    :return dataset, model: The dataset and model objects created.
    """
    dataset = Dataset(args.dataset)
    entities = list(dataset.get_labels())

    pipeline = None

    if args.pipeline == 'spacy':
        logging.info('Using spacy model')
        model = SpacyModel(spacy_model_name=args.spacy_model, cuda=args.cuda)
    elif args.custom_pipeline is not None:
        # Construct a pipeline class (not an instance) based on the provided json path;
        # args.custom_pipeline is that path
        Pipeline = json_to_pipeline(args.custom_pipeline)
        # All parameters are part of the class, thus nothing needs to be set when instantiating
        pipeline = Pipeline()
        model = Model(pipeline)
    else:
        # Parse the argument as a class name in module medacy.pipelines
        module = importlib.import_module("medacy.pipelines")
        Pipeline = getattr(module, args.pipeline)
        logging.info('Using %s', args.pipeline)

        pipeline = Pipeline(entities=entities,
                            cuda_device=args.cuda,
                            word_embeddings=args.word_embeddings,
                            batch_size=args.batch_size,
                            learning_rate=args.learning_rate,
                            epochs=args.epochs,
                            pretrained_model=args.pretrained_model,
                            using_crf=args.using_crf)

        model = Model(pipeline)

    return dataset, model
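A hedged sketch of driving this setup with a hand-built namespace; the attribute names are inferred from the function body, the values are placeholders, and whether a given pipeline accepts every keyword argument is an assumption:

from argparse import Namespace

# Placeholder values; attribute names mirror those read inside setup().
args = Namespace(
    dataset='data/train_annotated',
    pipeline='LstmSystematicReviewPipeline',
    custom_pipeline=None,
    spacy_model=None,
    cuda=-1,
    word_embeddings='w2v_embeddings.bin',  # placeholder path
    batch_size=32,
    learning_rate=0.1,
    epochs=10,
    pretrained_model=None,
    using_crf=False,
)
dataset, model = setup(args)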
Example #11
    def test_predict(self):
        """
        predict() has different functionality depending on what is passed to it; therefore this test
        ensures that each type of input is handled correctly
        """

        # Init the Model
        pipe = TestingPipeline(entities=self.entities)
        sample_model_path = os.path.join(test_dir, 'sample_models',
                                         'sample_test_pipe.pkl')
        model = Model(pipe)
        model.load(sample_model_path)

        # Test passing a Dataset
        dataset_output = model.predict(self.dataset)
        self.assertIsInstance(dataset_output, Dataset)
        self.assertEqual(len(dataset_output), len(self.dataset))

        # Test passing a directory
        directory_output = model.predict(self.dataset.data_directory)
        self.assertIsInstance(directory_output, Dataset)
        self.assertEqual(len(directory_output), len(self.dataset))

        # Test passing a string
        string_output = model.predict('This is a sample string.')
        self.assertIsInstance(string_output, Annotations)

        # Test that the predictions are written to the expected location when no path is provided
        expected_dir = os.path.join(self.dataset.data_directory, 'predictions')
        self.assertTrue(os.path.isdir(expected_dir))

        # Delete that directory
        shutil.rmtree(expected_dir)

        # Test predicting to a specific directory
        model.predict(self.dataset.data_directory,
                      prediction_directory=self.prediction_directory_2)
        expected_files = os.listdir(self.prediction_directory_2)
        self.assertEqual(6, len(expected_files))
Example #12
def setup(args):
    """
    Sets up the dataset and pipeline/model, since they are used by every command.
    :param args: Argparse args object.
    :return dataset, model: The dataset and model objects created.
    """
    dataset = Dataset(args.dataset)
    entities = list(dataset.get_labels())
    if args.test_mode:
        dataset.data_limit = 1

    if args.entities is not None:
        with open(args.entities, 'rb') as f:
            data = json.load(f)
        json_entities = data['entities']
        if not set(json_entities) <= set(entities):
            raise ValueError(
                f"The following entities from the json file are not in the provided dataset: {set(json_entities) - set(entities)}"
            )
        entities = json_entities

    if args.custom_pipeline is not None:
        logging.info(
            f"Using custom pipeline configured at {args.custom_pipeline}")
        # Construct a pipeline class (not an instance) based on the provided json path;
        # args.custom_pipeline is that path
        Pipeline = json_to_pipeline(args.custom_pipeline)
    else:
        # Parse the argument as a class name in module medacy.pipelines
        module = importlib.import_module("medacy.pipelines")
        Pipeline = getattr(module, args.pipeline)
        logging.info('Using %s', args.pipeline)

    pipeline = Pipeline(entities=entities,
                        cuda_device=args.cuda,
                        word_embeddings=args.word_embeddings,
                        batch_size=args.batch_size,
                        learning_rate=args.learning_rate,
                        epochs=args.epochs,
                        pretrained_model=args.pretrained_model,
                        using_crf=args.using_crf)

    model = Model(pipeline)
    return dataset, model
Example #13
    def test_cross_validate_fit_predict(self):
        """Tests that a model created with BERT can be fitted and used to predict, with and without the CRF layer"""
        pipeline = BertPipeline(entities=self.entities,
                                pretrained_model='bert-base-cased',
                                batch_size=self.batch_size,
                                cuda_device=cuda_device)

        pipeline_crf = BertPipeline(entities=self.entities,
                                    pretrained_model='bert-base-cased',
                                    batch_size=self.batch_size,
                                    cuda_device=cuda_device,
                                    using_crf=True)

        for pipe in [pipeline, pipeline_crf]:
            model = Model(pipe)
            model.cross_validate(self.dataset, 2)
            model.fit(self.dataset)
            resulting_dataset = model.predict(
                self.dataset, prediction_directory=self.prediction_directory)
            self.assertIsInstance(resulting_dataset, Dataset)
            # Test that there is at least one prediction
            if not any(resulting_dataset.generate_annotations()):
                warn("The model did not generate any predictions")
Example #14
    def test_fit_predict_dump_load(self):
        """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""

        model = Model(self.pipeline)

        # Test attempting to predict before fitting
        with self.assertRaises(RuntimeError):
            model.predict('Lorem ipsum dolor sit amet.')

        model.fit(self.dataset,
                  groundtruth_directory=self.groundtruth_2_directory)
        # Test X and y data are set
        self.assertTrue(model.X_data)
        self.assertTrue(model.y_data)

        # Test that there is at least one prediction
        resulting_ann = model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)

        # Test prediction over directory
        resulting_dataset = model.predict(
            self.dataset.data_directory,
            prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        self.assertEqual(len(self.dataset), len(resulting_dataset))

        # Test that groundtruth is written
        groundtruth_dataset = Dataset(self.groundtruth_2_directory)
        expected = [d.file_name for d in self.dataset]
        actual = [d.file_name for d in groundtruth_dataset]
        self.assertListEqual(expected, actual)

        # Test that the groundtruth ann files have content
        for ann in groundtruth_dataset.generate_annotations():
            self.assertTrue(ann)

        # Test pickling a model
        pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
        model.dump(pickle_path)
        new_model = Model(self.pipeline)
        new_model.load(pickle_path)

        # Test that there is at least one prediction
        resulting_ann = new_model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)
Example #15
df_filtered['tweet_text'] = df_filtered['tweet_text'].apply(remove_emoji)

# Getting the annotations from each tweet's text.
# Part of the following code was obtained from:
# https://github.com/NLPatVCU/medaCy/

from medacy.model.model import Model

df_medacy_annotations = pd.DataFrame(columns=[
    'Tweet_id', 'Text_section', 'Span_start', 'Span_end', 'Annotation_type',
    'Extras'
])
df_medacy_tweets_tagged = pd.DataFrame(columns=['Tweet_id', 'Tweet_full_text'])

model = Model.load_external('medacy_model_clinical_notes')

print("Configuring the Medacy tagger. Please wait...")
for index, row in df_filtered.iterrows():

    annotation = model.predict(row['tweet_text'])

    if len(annotation) > 0:
        df_medacy_tweets_tagged.loc[len(df_medacy_tweets_tagged.index)] = [
            row['tweet_id'], row['tweet_text']
        ]

        for i in range(len(annotation)):
            df_medacy_annotations.loc[len(df_medacy_annotations)] = [
                row['tweet_id'], annotation.annotations[i][3],
                annotation.annotations[i][1], annotation.annotations[i][2],
                annotation.annotations[i][0],  # entity label
                ''  # 'Extras' placeholder; the source snippet is truncated here
            ]
Example #16
current_time = datetime.datetime.fromtimestamp(
    time.time()).strftime('%Y_%m_%d_%H.%M.%S')
logging.basicConfig(filename=model_directory + '/build_%s.log' % current_time,
                    level=logging.DEBUG)

# Initialize everything needed for the model

# Metamaps the datasets, if not already done, and stores the metamapped files
# for access in train_dataset and evaluation_dataset. See the Dataset API for details.
metamap = MetaMap("/home/share/programs/metamap/2016/public_mm/bin/metamap")
with metamap:
    train_dataset.metamap(metamap, n_jobs=3)
    evaluation_dataset.metamap(metamap, n_jobs=3)

# Selects the pre-processing pipeline this model should be trained with respect to.
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)
model = Model(pipeline, n_jobs=1)
# number of cores to utilize during feature extraction when training the model.
# Note: this is done by forking, not threading hence utlizes a large amount of memory.

# Write information about model before training
with open(model_directory + "/model_information.txt", 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" %
                     len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# dump fitted model
current_time = datetime.datetime.fromtimestamp(