Example #1
def main():
    parser = argparse.ArgumentParser(
        description='Inter-dataset agreement calculator')
    parser.add_argument('gold_directory', help='First data folder path (gold)')
    parser.add_argument('system_directory',
                        help='Second data folder path (system)')
    parser.add_argument('-m',
                        '--mode',
                        default='strict',
                        help='strict or lenient (defaults to strict)')
    parser.add_argument(
        '-f', '--format', default='plain',
        help='format to print the table (options include grid, github, and latex)')
    parser.add_argument('-d',
                        '--decimal',
                        type=int,
                        default=3,
                        help='number of decimal places to round to')
    args = parser.parse_args()

    gold_dataset = Dataset(args.gold_directory)
    system_dataset = Dataset(args.system_directory)

    result = measure_dataset(gold_dataset, system_dataset, args.mode)
    output = format_results(result,
                            num_dec=args.decimal,
                            table_format=args.format)
    print(output)
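
A minimal sketch of the same flow without the CLI wrapper. The directory paths are placeholders, and it assumes measure_dataset and format_results are importable from the same module as main() above.

from medacy.data.dataset import Dataset

# Placeholder paths; replace with real gold/system annotation folders
gold_dataset = Dataset('/path/to/gold')
system_dataset = Dataset('/path/to/system')

# Same call pattern as main() above: compare the datasets, then format the table
result = measure_dataset(gold_dataset, system_dataset, 'strict')
print(format_results(result, num_dec=3, table_format='github'))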
Example #2
    def test_fit_predict_dump_load(self):
        """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""

        model = Model(self.pipeline)

        # Test attempting to predict before fitting
        with self.assertRaises(RuntimeError):
            model.predict('Lorem ipsum dolor sit amet.')

        model.fit(self.dataset,
                  groundtruth_directory=self.groundtruth_2_directory)
        # Test X and y data are set
        self.assertTrue(model.X_data)
        self.assertTrue(model.y_data)

        # Test that there is at least one prediction
        resulting_ann = model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)

        # Test prediction over directory
        resulting_dataset = model.predict(
            self.dataset.data_directory,
            prediction_directory=self.prediction_directory)
        self.assertIsInstance(resulting_dataset, Dataset)
        self.assertEqual(len(self.dataset), len(resulting_dataset))

        # Test that groundtruth is written
        groundtruth_dataset = Dataset(self.groundtruth_2_directory)
        expected = [d.file_name for d in self.dataset]
        actual = [d.file_name for d in groundtruth_dataset]
        self.assertListEqual(expected, actual)

        # Test that the groundtruth ann files have content
        for ann in groundtruth_dataset.generate_annotations():
            self.assertTrue(ann)

        # Test pickling a model
        pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
        model.dump(pickle_path)
        new_model = Model(self.pipeline)
        new_model.load(pickle_path)

        # Test that there is at least one prediction
        resulting_ann = new_model.predict(
            'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
        )
        self.assertIsInstance(resulting_ann, Annotations)
        self.assertTrue(resulting_ann)
Example #3
 def setUpClass(cls):
     cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'),
                           data_limit=1)
     cls.entities = cls.dataset.get_labels(as_list=True)
     cls.prediction_directory = tempfile.mkdtemp()  # Directory to store predictions
     cls.batch_size = 3
Example #4
def main():
    parser = argparse.ArgumentParser(description="Display which annotations in a dataset overlap")
    parser.add_argument("dataset", help="Directory of the dataset")
    args = parser.parse_args()

    dataset = Dataset(args.dataset)
    calculate_dataset_overlap(dataset)
Example #5
 def setUpClass(cls):
     cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
     cls.entities = cls.dataset.get_labels(as_list=True)
     cls.prediction_directory = tempfile.mkdtemp()  # directory to store predictions
     cls.prediction_directory_2 = tempfile.mkdtemp()
     cls.pipeline = TestingPipeline(entities=cls.entities)
Example #6
    def test_cross_validate_create_groundtruth_predictions(self):
        """
        Tests that during cross validation, the medaCy groundtruth (that is, the version of the training dataset
        used by medaCy) is written as well as the predictions that are created for each fold
        """
        model = Model(self.pipeline)
        model.cross_validate(self.dataset,
                             num_folds=2,
                             prediction_directory=self.prediction_directory_3,
                             groundtruth_directory=self.groundtruth_directory)

        prediction_dataset = Dataset(self.prediction_directory_3)
        groundtruth_dataset = Dataset(self.groundtruth_directory)

        for d in [prediction_dataset, groundtruth_dataset]:
            self.assertIsInstance(d, Dataset)

        original_file_names = {d.file_name for d in self.dataset}
        prediction_file_names = {d.file_name for d in prediction_dataset}
        groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

        for n in [prediction_file_names, groundtruth_file_names]:
            self.assertSetEqual(n, original_file_names)

        # Container for all Annotations in all files in all folds
        all_anns_all_folds_actual = Annotations([])

        # Test that fold groundtruth is written to file
        for fold_name in ["fold_1", "fold_2"]:
            fold_dataset = Dataset(groundtruth_dataset.data_directory /
                                   fold_name)
            for d in fold_dataset:
                fold_ann = Annotations(d.ann_path)
                groundtruth_ann = groundtruth_dataset[d.file_name]
                # Test that the entities in the fold groundtruth are a subset of the whole for that file
                self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
                all_anns_all_folds_actual |= fold_ann

        # Container for all annotations pulled directly from the groundtruth dataset
        all_groundtruth_tuples = Annotations([])
        for ann in groundtruth_dataset.generate_annotations():
            all_groundtruth_tuples |= ann

        expected = set(all_groundtruth_tuples)
        actual = set(all_anns_all_folds_actual)
        self.assertSetEqual(expected, actual)
Example #7
def setup(args):
    """
    Sets up dataset and pipeline/model since it gets used by every command.
    :param args: Argparse args object.
    :return dataset, model: The dataset and model objects created.
    """
    dataset = Dataset(args.dataset)
    entities = list(dataset.get_labels())
    if args.test_mode:
        dataset.data_limit = 1

    if args.entities is not None:
        with open(args.entities, 'rb') as f:
            data = json.load(f)
        json_entities = data['entities']
        if not set(json_entities) <= set(entities):
            raise ValueError(
                f"The following entities from the json file are not in the provided dataset: {set(json_entities) - set(entities)}"
            )
        entities = json_entities

    if args.custom_pipeline is not None:
        logging.info(
            f"Using custom pipeline configured at {args.custom_pipeline}")
        # Construct a pipeline class (not an instance) based on the provided json path;
        # args.custom_pipeline is that path
        Pipeline = json_to_pipeline(args.custom_pipeline)
    else:
        # Parse the argument as a class name in module medacy.pipelines
        module = importlib.import_module("medacy.pipelines")
        Pipeline = getattr(module, args.pipeline)
        logging.info('Using %s', args.pipeline)

    pipeline = Pipeline(entities=entities,
                        cuda_device=args.cuda,
                        word_embeddings=args.word_embeddings,
                        batch_size=args.batch_size,
                        learning_rate=args.learning_rate,
                        epochs=args.epochs,
                        pretrained_model=args.pretrained_model,
                        using_crf=args.using_crf)

    model = Model(pipeline)
    return dataset, model
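
As a rough illustration of what setup() expects, the sketch below builds an argparse-style namespace by hand. The attribute names mirror those read in the function above, but the concrete values (paths, pipeline name, hyperparameters) are placeholders, and whether a given pipeline class accepts every keyword is up to that pipeline.

from argparse import Namespace

# Hypothetical values; attribute names match those accessed in setup() above
args = Namespace(
    dataset='/path/to/training_data',
    test_mode=False,
    entities=None,             # or a path to a JSON file with an "entities" list
    custom_pipeline=None,      # or a path to a pipeline JSON configuration
    pipeline='ClinicalPipeline',
    cuda=-1,
    word_embeddings=None,
    batch_size=8,
    learning_rate=0.001,
    epochs=10,
    pretrained_model=None,
    using_crf=False,
)

dataset, model = setup(args)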
Example #8
    def setUpClass(cls):
        cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
        cls.prediction_directory = tempfile.mkdtemp()  # Set up predict directory
        cls.entities = cls.dataset.get_labels(as_list=True)
        cls.ann_files = []

        # Fill directory of prediction files (only the text files)
        for data_file in cls.dataset:
            new_file_path = os.path.join(cls.prediction_directory,
                                         data_file.file_name + '.txt')
            shutil.copyfile(data_file.txt_path, new_file_path)
Example #9
    def setUpClass(cls):
        """Loads sample dataset and sets up a temporary directory for IO tests"""
        cls.test_dir = tempfile.mkdtemp()  # set up temp directory
        cls.sample_data_dir = os.path.join(test_dir, 'sample_dataset_1')
        cls.dataset = Dataset(cls.sample_data_dir)
        cls.entities = cls.dataset.get_labels(as_list=True)

        with open(os.path.join(cls.test_dir, "broken_ann_file.ann"), 'w') as f:
            f.write("This is clearly not a valid ann file")

        cls.ann_path_1 = cls.dataset.data_files[0].ann_path
        cls.ann_path_2 = cls.dataset.data_files[1].ann_path
Example #10
File: __main__.py, Project: zazabar/medaCy
def setup(args):
    """
    Sets up dataset and pipeline/model since it gets used by every command.

    :param args: Argparse args object.
    :return dataset, model: The dataset and model objects created.
    """
    dataset = Dataset(args.dataset)
    entities = list(dataset.get_labels())

    pipeline = None

    if args.pipeline == 'spacy':
        logging.info('Using spacy model')
        model = SpacyModel(spacy_model_name=args.spacy_model, cuda=args.cuda)
    elif args.custom_pipeline is not None:
        # Construct a pipeline class (not an instance) based on the provided json path;
        # args.custom_pipeline is that path
        Pipeline = json_to_pipeline(args.custom_pipeline)
        # All parameters are part of the class, thus nothing needs to be set when instantiating
        pipeline = Pipeline()
        model = Model(pipeline)
    else:
        # Parse the argument as a class name in module medacy.pipelines
        module = importlib.import_module("medacy.pipelines")
        Pipeline = getattr(module, args.pipeline)
        logging.info('Using %s', args.pipeline)

        pipeline = Pipeline(entities=entities,
                            cuda_device=args.cuda,
                            word_embeddings=args.word_embeddings,
                            batch_size=args.batch_size,
                            learning_rate=args.learning_rate,
                            epochs=args.epochs,
                            pretrained_model=args.pretrained_model,
                            using_crf=args.using_crf)

        model = Model(pipeline)

    return dataset, model
Example #11
    def predict_directory(self, data_directory, prediction_directory):
        """
        Predicts over all txt files in a directory using every Model. Note that this method spends a lot of time
        on file IO because each txt file is opened as many times as there are models.
        :param data_directory: Path to a directory of text files to predict over
        :param prediction_directory: a directory to write predictions to
        :return: a Dataset of the predictions
        """
        if not os.path.isdir(data_directory):
            raise ValueError(
                f"'data_directory' must be an existing directory, but is '{repr(data_directory)}'"
            )
        if not os.path.isdir(prediction_directory):
            raise ValueError(
                f"'prediction_directory' must be a directory, but is '{repr(prediction_directory)}'"
            )

        # Get all the txt files in the input directory
        txt_files = [f for f in os.listdir(data_directory) if f.endswith('.txt')]
        # Create a dictionary of empty Annotations objects to store the predictions
        annotation_dict = {f: Annotations([], source_text_path=f) for f in txt_files}

        for model in self:
            for file_name in txt_files:
                file_path = os.path.join(data_directory, file_name)
                with open(file_path) as f:
                    text = f.read()
                this_annotations = annotation_dict[file_name]
                resulting_annotations = model.predict(text)
                # Merge the two Annotations together and store them back in the dictionary
                annotation_dict[file_name] = this_annotations | resulting_annotations

        # Create the new Dataset directory
        for path, ann in annotation_dict.items():
            # Get the name of the output ann file
            path = os.path.join(data_directory, path)
            base_name = os.path.basename(path)[:-4]
            output_ann = os.path.join(prediction_directory, base_name + '.ann')
            output_txt = os.path.join(prediction_directory, base_name + '.txt')

            # Write the ann file
            ann.to_ann(output_ann)
            # Copy the txt file
            copyfile(path, output_txt)

        return Dataset(prediction_directory)
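
A condensed usage sketch with placeholder model paths, directories, and entity lists; the pipelines and add_model call pattern follow Example #15 below. Because each file's Annotations are unioned across models, the written .ann files combine every model's labels.

multimodel = MultiModel()
multimodel.add_model('/path/to/model_one.pkl', ClinicalPipeline, ['Species', 'DoseUnits'])
multimodel.add_model('/path/to/model_two.pkl', TestingPipeline, entities=['Dose', 'Sex'])

# One .ann file (plus a copy of the .txt) is written per input file
predictions = multimodel.predict_directory('/path/to/txt_files', '/path/to/output')
print(len(predictions))  # one entry per .txt file predicted over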
Example #12
def main():
    parser = argparse.ArgumentParser(
        description="Calculate the lexical variation in a given dataset")
    parser.add_argument('dataset', help="Path to the dataset directory")
    parser.add_argument(
        '-f', '--format',
        help="Format to print the table (options include grid, github, and latex)")
    args = parser.parse_args()

    data = Dataset(args.dataset)
    unique_mention_dict = calculate_unique_mentions(data)
    tag_counts = data.compute_counts()

    table = [['Tag', 'Unique Mentions', 'Total Mentions', 'Ratio']]
    for tag, mentions in unique_mention_dict.items():
        table.append([
            tag,
            len(mentions), tag_counts[tag],
            len(mentions) / tag_counts[tag]
        ])

    print(tabulate.tabulate(table, headers="firstrow", tablefmt=args.format))
Example #13
    def setUpClass(cls) -> None:
        if not have_metamap:
            return
        cls.metamap = MetaMap(metamap_path)
        cls.metamap.activate()

        # Create an unmetamapped copy of the sample dataset
        cls.temp_dataset_dir = tempfile.mkdtemp()
        for df in sample_dataset:
            shutil.copyfile(
                df.txt_path,
                os.path.join(cls.temp_dataset_dir, df.file_name + '.txt'))
            shutil.copyfile(
                df.ann_path,
                os.path.join(cls.temp_dataset_dir, df.file_name + '.ann'))

        cls.dataset = Dataset(cls.temp_dataset_dir)
Example #14
    def predict_annotation_evaluation(self, directory, training_dataset,
                                      preds_by_document,
                                      groundtruth_by_document, option):
        for data_file in training_dataset:
            logging.info("Predicting %s file: %s", option, data_file.file_name)
            with open(data_file.txt_path, 'r') as f:
                doc = self.pipeline.spacy_pipeline.make_doc(f.read())

            if option == "groundtruth":
                preds = groundtruth_by_document[data_file.file_name]
            else:
                preds = preds_by_document[data_file.file_name]

            annotations = construct_annotations_from_tuples(doc, preds)
            annotations.to_ann(
                write_location=os.path.join(directory, data_file.file_name + ".ann"))

        return Dataset(directory)
Example #15
    def test_multi_model(self):
        """Runs all tests for valid uses of MultiModel"""

        data = Dataset(self.data_dir)
        ents_1 = {'Endpoints', 'Species', 'DoseUnits'}
        ents_2 = {'TestArticle', 'Dose', 'Sex'}

        multimodel = MultiModel()
        # Test that *args works
        multimodel.add_model(self.sample_model_1_path, ClinicalPipeline,
                             list(ents_1))
        # Test that **kwargs works
        multimodel.add_model(self.sample_model_2_path,
                             TestingPipeline,
                             entities=list(ents_2))

        # Test __len__
        self.assertEqual(len(multimodel), 2)

        # Test that each model gets instantiated correctly
        for model, pipeline_class in zip(multimodel,
                                         [ClinicalPipeline, TestingPipeline]):
            current_pipeline = model.pipeline
            self.assertIsInstance(current_pipeline, pipeline_class)
            self.assertGreater(len(current_pipeline.entities), 0)

        # Test predict_directory
        resulting_data = multimodel.predict_directory(data.data_directory,
                                                      self.temp_dir)
        labeled_items = resulting_data.get_labels()

        # Test that at least one label from each model is predicted
        self.assertTrue(any(e in ents_1 for e in labeled_items))
        self.assertTrue(any(e in ents_2 for e in labeled_items))

        # Test that all files get predicted for
        self.assertEqual(len(resulting_data), len(data))
Example #16
 def test_init_with_data_limit(self):
     """Tests that initializing with a data limit works"""
     dataset = Dataset(self.dataset.data_directory, data_limit=1)
     self.assertEqual(len(dataset), 1)
Example #17
    def cross_validate(self,
                       training_dataset=None,
                       num_folds=5,
                       prediction_directory=None,
                       groundtruth_directory=None,
                       asynchronous=False):
        """
        Performs k-fold stratified cross-validation using our model and pipeline.

        If training_dataset, groundtruth_directory, and prediction_directory are all provided, the predictions made
        during cross validation are written to prediction_directory and the medaCy ground truth is written to
        groundtruth_directory. This allows one to construct a confusion matrix or to compute the prediction
        ambiguity with the methods present in the Dataset class, supporting pipeline development without a
        designated evaluation set.

        :param training_dataset: Dataset that is being cross validated (optional)
        :param num_folds: number of folds to split training data into for cross validation
        :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory.
        :param groundtruth_directory: directory to write the ground truth MedaCy evaluates on
        :param asynchronous: Boolean for whether the preprocessing should be done asynchronously.
        :return: a Dataset of the predictions if prediction_directory is given, otherwise a dictionary of statistics aggregated over all folds
        """

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1, but is %s"
                % repr(num_folds))

        if prediction_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generate predictions during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )
        if groundtruth_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generate groundtruth during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )

        pipeline_report = self.pipeline.get_report()

        self.preprocess(training_dataset, asynchronous)

        if not (self.X_data and self.y_data):
            raise RuntimeError(
                "Must have features and labels extracted for cross validation")

        tags = sorted(training_dataset.get_labels(as_list=True))
        self.pipeline.entities = tags
        logging.info('Tagset: %s', tags)

        eval_stats = {}

        # Dict for storing mapping of sequences to their corresponding file
        groundtruth_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}
        preds_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}

        folds = create_folds(self.y_data, num_folds)

        for fold_num, fold_data in enumerate(folds, 1):
            train_indices, test_indices = fold_data
            fold_statistics = {}
            learner_name, learner = self.pipeline.get_learner()

            X_train = [self.X_data[index] for index in train_indices]
            y_train = [self.y_data[index] for index in train_indices]

            X_test = [self.X_data[index] for index in test_indices]
            y_test = [self.y_data[index] for index in test_indices]

            logging.info("Training Fold %i", fold_num)
            train_data = [x[0] for x in X_train]
            test_data = [x[0] for x in X_test]
            learner.fit(train_data, y_train)
            y_pred = learner.predict(test_data)

            if groundtruth_directory is not None:
                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []
                for sequence in X_test:
                    document_indices += [sequence[2]] * len(sequence[0])
                    span_indices += list(sequence[1])
                groundtruth = [
                    element for sentence in y_test for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0

                while i < len(groundtruth):
                    if groundtruth[i] == 'O':
                        i += 1
                        continue

                    entity = groundtruth[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]
                    # Ensure that consecutive tokens with the same label are merged
                    while i < len(groundtruth) - 1 and groundtruth[i + 1] == entity:
                        i += 1  # If inside entity, keep incrementing

                    last_start, last_end = span_indices[i]
                    groundtruth_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            if prediction_directory is not None:
                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []

                for sequence in X_test:
                    document_indices += [sequence[2]] * len(sequence[0])
                    span_indices += list(sequence[1])

                predictions = [
                    element for sentence in y_pred for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0

                while i < len(predictions):
                    if predictions[i] == 'O':
                        i += 1
                        continue

                    entity = predictions[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]

                    # Ensure that consecutive tokens with the same label are merged
                    while i < len(predictions) - 1 and predictions[i + 1] == entity:
                        i += 1  # If inside entity, keep incrementing

                    last_start, last_end = span_indices[i]
                    preds_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            # Compute the metrics for this fold.
            for label in tags:
                fold_statistics[label] = {
                    "recall":
                    metrics.flat_recall_score(y_test,
                                              y_pred,
                                              average='weighted',
                                              labels=[label]),
                    "precision":
                    metrics.flat_precision_score(y_test,
                                                 y_pred,
                                                 average='weighted',
                                                 labels=[label]),
                    "f1":
                    metrics.flat_f1_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=[label])
                }

            # add averages
            fold_statistics['system'] = {
                "recall":
                metrics.flat_recall_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=tags),
                "precision":
                metrics.flat_precision_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=tags),
                "f1":
                metrics.flat_f1_score(y_test,
                                      y_pred,
                                      average='weighted',
                                      labels=tags)
            }

            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in tags + ['system']]

            logging.info(
                '\n' +
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            eval_stats[fold_num] = fold_statistics

        statistics_all_folds = {}

        for label in tags + ['system']:
            statistics_all_folds[label] = {
                'precision_average':
                mean(eval_stats[fold][label]['precision']
                     for fold in eval_stats),
                'precision_max':
                max(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'precision_min':
                min(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'recall_average':
                mean(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'recall_max':
                max(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'f1_average':
                mean(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_max':
                max(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_min':
                min(eval_stats[fold][label]['f1'] for fold in eval_stats),
            }

        entity_counts = training_dataset.compute_counts()

        table_data = [
            [
                f"{label} ({entity_counts[label]})",  # Entity (Count)
                format(statistics_all_folds[label]['precision_average'],
                       ".3f"),
                format(statistics_all_folds[label]['recall_average'], ".3f"),
                format(statistics_all_folds[label]['f1_average'], ".3f"),
                format(statistics_all_folds[label]['f1_min'], ".3f"),
                format(statistics_all_folds[label]['f1_max'], ".3f")
            ] for label in tags + ['system']
        ]

        # Combine the pipeline report and the resulting data, then log it or print it (whichever ensures that it prints)

        output_str = '\n' + pipeline_report + '\n\n' + tabulate(
            table_data,
            headers=[
                'Entity (Count)', 'Precision', 'Recall', 'F1', 'F1_Min',
                'F1_Max'
            ],
            tablefmt='orgtbl')

        if logging.root.level > logging.INFO:
            print(output_str)
        else:
            logging.info(output_str)

        if prediction_directory:

            prediction_directory = os.path.join(
                training_dataset.data_directory, "predictions")
            groundtruth_directory = os.path.join(
                training_dataset.data_directory, "groundtruth")

            # Write annotations generated from cross-validation
            self.create_annotation_directory(directory=prediction_directory,
                                             training_dataset=training_dataset,
                                             option="predictions")

            # Write medaCy ground truth generated from cross-validation
            self.create_annotation_directory(directory=groundtruth_directory,
                                             training_dataset=training_dataset,
                                             option="groundtruth")

            # Add predicted/known annotations to the folders containing groundtruth and predictions respectively
            self.predict_annotation_evaluation(
                directory=groundtruth_directory,
                training_dataset=training_dataset,
                preds_by_document=preds_by_document,
                groundtruth_by_document=groundtruth_by_document,
                option="groundtruth")

            self.predict_annotation_evaluation(
                directory=prediction_directory,
                training_dataset=training_dataset,
                preds_by_document=preds_by_document,
                groundtruth_by_document=groundtruth_by_document,
                option="predictions")

            return Dataset(prediction_directory)
        else:
            return statistics_all_folds
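
A minimal sketch of calling cross_validate directly, with a placeholder dataset path and the TestingPipeline used in the setUpClass examples above. Without prediction_directory the call returns the statistics dictionary built above; with prediction_directory and groundtruth_directory, it writes fold output and returns a Dataset, as Example #6 exercises.

dataset = Dataset('/path/to/training_data')        # placeholder path
pipeline = TestingPipeline(entities=dataset.get_labels(as_list=True))
model = Model(pipeline)

# With no prediction_directory, cross_validate returns the statistics_all_folds dictionary
stats = model.cross_validate(training_dataset=dataset, num_folds=5)
print(stats['system']['f1_average'])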
Example #18
    def predict(self, input_data, prediction_directory=None):
        """
        Generates predictions over a string, a Dataset, or a directory of text files, utilizing the pipeline equipped to this instance.

        :param input_data: a string, Dataset, or directory path to predict over
        :param prediction_directory: The directory to write predictions if doing bulk prediction
            (default: */prediction* sub-directory of Dataset)
        :return: if input_data is a str, returns an Annotations of the predictions;
            if input_data is a Dataset or a valid directory path, returns a Dataset of the predictions.

        Note that if input_data is supposed to be a directory path but the directory is not found, it will be predicted
        over as a string. This can be prevented by validating inputs with os.path.isdir().
        """

        if self.model is None:
            raise RuntimeError(
                "Must fit or load a pickled model before predicting")

        if isinstance(input_data, str) and not os.path.isdir(input_data):
            doc = self.pipeline.spacy_pipeline.make_doc(input_data)
            doc.set_extension('file_name', default=None, force=True)
            doc._.file_name = 'STRING_INPUT'
            doc = self.pipeline(doc, predict=True)
            annotations = predict_document(self.model, doc, self.pipeline)
            return annotations

        if isinstance(input_data, Dataset):
            input_files = [d.txt_path for d in input_data]
            # Change input_data to point to the Dataset's directory path so that we can use it
            # to create the prediction directory
            input_data = input_data.data_directory
        elif os.path.isdir(input_data):
            input_files = [
                os.path.join(input_data, f) for f in os.listdir(input_data)
                if f.endswith('.txt')
            ]
        else:
            raise ValueError(
                f"'input_data' must be a string (which can be a directory path) or a Dataset, but is {repr(input_data)}"
            )

        if prediction_directory is None:
            prediction_directory = os.path.join(input_data, 'predictions')
            if os.path.isdir(prediction_directory):
                logging.warning("Overwriting existing predictions at %s",
                                prediction_directory)
            else:
                os.mkdir(prediction_directory)

        for file_path in input_files:
            # Drop the .txt extension to get the base file name
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            logging.info("Predicting file: %s", file_path)

            with open(file_path, 'r') as f:
                doc = self.pipeline.spacy_pipeline.make_doc(f.read())

            doc.set_extension('file_name', default=None, force=True)
            doc._.file_name = file_name

            # run through the pipeline
            doc = self.pipeline(doc, predict=True)

            # Predict, creating a new Annotations object
            annotations = predict_document(self.model, doc, self.pipeline)
            logging.debug(
                "Writing to: %s",
                os.path.join(prediction_directory, file_name + ".ann"))
            annotations.to_ann(
                write_location=os.path.join(prediction_directory, file_name + ".ann"))

            # Copy the txt file so that the output will also be a Dataset
            copyfile(file_path,
                     os.path.join(prediction_directory, file_name + ".txt"))

        return Dataset(prediction_directory)
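
The sketch below summarizes the two calling conventions the docstring describes, using a previously pickled model. The paths and the entity label are placeholders rather than anything shipped with medaCy.

model = Model(TestingPipeline(entities=['Drug']))   # placeholder entity label
model.load('/path/to/model.pkl')                    # placeholder pickle path

# A plain string returns an Annotations object
ann = model.predict('Patients received 50 mg of the study drug daily.')

# A directory (or Dataset) of .txt files returns a Dataset written to prediction_directory
predictions = model.predict('/path/to/txt_files',
                            prediction_directory='/path/to/predictions')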
Example #19
File: __init__.py, Project: zazabar/medaCy
import os

from medacy.data.dataset import Dataset

test_dir = os.path.dirname(__file__)
sample_dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
Example #20
# Will build a model named model_name with the pipeline and parameters defined below. This script places the model in
# its own directory along with the model's build log and model/pipeline parameters, to keep results easily referenceable
# at run time. Once a sufficient model is produced, consider wrapping it into a medaCy-compatible model as described in the example guide.

import datetime
import logging
import os
import sys
import time

from medacy.data.dataset import Dataset
from medacy.model.model import Model
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
from medacy.pipelines.systematic_review_pipeline import SystematicReviewPipeline

train_dataset, evaluation_dataset = Dataset.load_external(
    'medacy_dataset_tac_2018')
entities = train_dataset.get_labels(as_list=True)

if len(sys.argv) < 2:
    exit(0)

# For rapid model prototyping, will train and predict by simply running the script with a model name as a parameter.
model_name = sys.argv[1]  # name for the model, use underscores
model_notes = "notes about the current model"  # notes about current model to be stored in a model information file by this script.

model_directory = "/home/username/named_entity_recognition/challenges/challenge_n/models/%s" % model_name.replace(
    " ", '_')

if model_name == "" or os.path.isdir(model_directory):
    print("Model name is empty or model directory already exists, aborting")
    exit(0)
Example #21

import argparse
import glob
import os
from collections import defaultdict
from xml.etree import cElementTree
from medacy.data.dataset import Dataset

# Setup
parser = argparse.ArgumentParser(description='n2c2: Evaluation script for Track 2')
parser.add_argument('folder1', help='First data folder path (gold)')
parser.add_argument('folder2', help='Second data folder path (system)')
args = parser.parse_args()

gold_dataset = Dataset(args.folder1)
prediction_dataset = Dataset(args.folder2)
global_tags = tuple(gold_dataset.get_labels() & prediction_dataset.get_labels())


class ClinicalCriteria(object):
    """Criteria in the Track 1 documents."""

    def __init__(self, tid, value):
        """Init."""
        self.tid = tid.strip().upper()
        self.ttype = self.tid
        self.value = value.lower().strip()

    def equals(self, other, mode='strict'):
        """Return whether the current criteria is equal to the one provided."""
Example #22
 def setUpClass(cls) -> None:
     cls.gold_dataset = sample_dataset
     cls.predicted_dataset = Dataset(
         str(sample_dataset.data_directory) + "_predictions")
     cls.maxDiff = None
Example #23
 def test_init_prediction(self):
     """Tests that the copy of the sample dataset with only text files is identified as being for prediction"""
     dataset = Dataset(self.prediction_directory)
     self.assertIsInstance(dataset, Dataset)
     self.assertFalse(dataset.is_training_directory)
Example #24
    def test_init(self):
        """Tests initializing Datasets from different directories to see that they create accurate DataFiles"""

        # Test txt, ann, and metamapped files together
        test_dir_path = Path(self.dataset.data_directory)
        expected = [
            DataFile(file_name="PMC1257590",
                     txt_path=test_dir_path / "PMC1257590.txt",
                     ann_path=test_dir_path / "PMC1257590.ann",
                     metamapped_path=test_dir_path / "metamapped" /
                     "PMC1257590.metamapped"),
            DataFile(file_name="PMC1314908",
                     txt_path=test_dir_path / "PMC1314908.txt",
                     ann_path=test_dir_path / "PMC1314908.ann",
                     metamapped_path=test_dir_path / "metamapped" /
                     "PMC1314908.metamapped"),
            DataFile(file_name="PMC1392236",
                     txt_path=test_dir_path / "PMC1392236.txt",
                     ann_path=test_dir_path / "PMC1392236.ann",
                     metamapped_path=test_dir_path / "metamapped" /
                     "PMC1392236.metamapped")
        ]
        expected.sort(key=lambda x: x.file_name)
        actual = list(self.dataset)
        self.assertListEqual(actual, expected)

        # Test txt only
        test_dir_path = Path(self.prediction_directory)
        expected = [
            DataFile(file_name="PMC1257590",
                     txt_path=test_dir_path / "PMC1257590.txt",
                     ann_path=None,
                     metamapped_path=None),
            DataFile(file_name="PMC1314908",
                     txt_path=test_dir_path / "PMC1314908.txt",
                     ann_path=None,
                     metamapped_path=None),
            DataFile(file_name="PMC1392236",
                     txt_path=test_dir_path / "PMC1392236.txt",
                     ann_path=None,
                     metamapped_path=None)
        ]
        expected.sort(key=lambda x: x.file_name)
        actual = list(Dataset(self.prediction_directory))
        self.assertListEqual(actual, expected)

        # Test ann only
        test_dir_path = Path(self.ann_dir)
        expected = [
            DataFile(file_name="PMC1257590",
                     txt_path=None,
                     ann_path=test_dir_path / "PMC1257590.ann",
                     metamapped_path=None),
            DataFile(
                file_name="PMC1314908",
                txt_path=None,
                ann_path=test_dir_path / "PMC1314908.ann",
                metamapped_path=None,
            ),
            DataFile(file_name="PMC1392236",
                     txt_path=None,
                     ann_path=test_dir_path / "PMC1392236.ann",
                     metamapped_path=None)
        ]
        expected.sort(key=lambda x: x.file_name)
        actual = list(Dataset(self.ann_dir))
        self.assertListEqual(actual, expected)