def setup(args):
    dataset = Dataset(args.dataset)
    pipeline = None

    if args.pipeline == 'spacy':
        model = SpacyModel
        return dataset, model
    else:
        labels = list(dataset.get_labels())
        pipeline_arg = args.pipeline

        # Parse the argument as a class name in the module medacy.ner.pipelines
        module = importlib.import_module("medacy.ner.pipelines")
        pipeline_class = getattr(module, pipeline_arg)

        if args.word_embeddings is not None:
            pipeline = pipeline_class(entities=labels, word_embeddings=args.word_embeddings)
        else:
            pipeline = pipeline_class(entities=labels)

        model = Model(pipeline)
        return dataset, model
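# A minimal usage sketch for setup() above. The flag names mirror the attributes
# setup() reads (dataset, pipeline, word_embeddings), but this parser is a
# hypothetical stand-in, not necessarily the project's actual CLI.
import argparse

parser = argparse.ArgumentParser(description='train a medaCy NER model')
parser.add_argument('--dataset', required=True, help='path to a medaCy-compatible dataset directory')
parser.add_argument('--pipeline', default='ClinicalPipeline',
                    help="'spacy' or a class name from medacy.ner.pipelines")
parser.add_argument('--word_embeddings', default=None, help='optional path to word embeddings')

args = parser.parse_args(['--dataset', '/path/to/data', '--pipeline', 'ClinicalPipeline'])
dataset, model = setup(args)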
def test_init_training(self):
    """Tests initialization of a Dataset from a training directory."""
    dataset = Dataset(self.training_directory)
    self.assertIsInstance(dataset, Dataset)
    self.assertTrue(dataset.is_training())

def test_init_with_data_limit(self):
    """Tests initialization of a Dataset with a data limit."""
    dataset = Dataset(self.training_directory, data_limit=6)
    self.assertEqual(len(dataset.get_data_files()), 6)

def test_init_prediction(self):
    """Tests initialization of a Dataset from a prediction directory."""
    dataset = Dataset(self.prediction_directory)
    self.assertIsInstance(dataset, Dataset)
    self.assertFalse(dataset.is_training())
def setUpClass(cls): """Loads END dataset and writes files to temp directory""" cls.test_dir = tempfile.mkdtemp() # set up temp directory cls.dataset, _, meta_data = Dataset.load_external('medacy_dataset_end') cls.entities = meta_data['entities'] cls.ann_files = [] # fill directory of training files for data_file in cls.dataset.get_data_files(): file_name, raw_text, ann_text = (data_file.file_name, data_file.raw_path, data_file.ann_path) cls.ann_files.append(file_name + '.ann') with open(join(cls.test_dir, "broken_ann_file.ann"), 'w') as f: f.write("This is clearly not a valid ann file") cls.ann_file_path_one = join(cls.test_dir, "ann1.ann") with open(cls.ann_file_path_one, "w+") as f: f.write(ann_text_one) cls.ann_file_path_two = join(cls.test_dir, "ann1.ann") with open(cls.ann_file_path_one, "w+") as f: f.write(ann_text_two) cls.ann_file_path_modified = join(cls.test_dir, "ann_mod.ann") with open(cls.ann_file_path_modified, "w+") as f: f.write(ann_text_one_modified) cls.ann_file_path_source = join(cls.test_dir, "ann_source.txt") with open(cls.ann_file_path_source, "w+") as f: f.write(ann_text_one_source)
@classmethod
def setUpClass(cls):
    if importlib.util.find_spec('medacy_dataset_end') is None:
        raise ImportError(
            "medacy_dataset_end was not automatically installed for testing."
            " See testing instructions for details.")

    cls.training_directory = tempfile.mkdtemp()  # set up train directory
    cls.prediction_directory = tempfile.mkdtemp()  # set up predict directory
    dataset, entities = Dataset.load_external('medacy_dataset_end')
    cls.entities = entities
    cls.ann_files = []

    # fill directory of training files
    for data_file in dataset.get_data_files():
        file_name = data_file.file_name
        cls.ann_files.append(file_name + '.ann')

        # data_file exposes paths to the source files; read them so the copies
        # contain the documents themselves rather than their path strings
        with open(data_file.raw_path, 'r') as f:
            raw_text = f.read()
        with open(data_file.ann_path, 'r') as f:
            ann_text = f.read()

        with open(os.path.join(cls.training_directory, "%s.txt" % file_name), 'w') as f:
            f.write(raw_text)
        with open(os.path.join(cls.training_directory, "%s.ann" % file_name), 'w') as f:
            f.write(ann_text)

        # place only text files into the prediction directory
        with open(os.path.join(cls.prediction_directory, "%s.txt" % file_name), 'w') as f:
            f.write(raw_text)
@classmethod
def setUpClass(cls):
    if importlib.util.find_spec('medacy_dataset_end') is None:
        raise ImportError(
            "medacy_dataset_end was not automatically installed for testing."
            " See testing instructions for details.")

    cls.train_dataset, cls.entities = Dataset.load_external('medacy_dataset_end')
    cls.train_dataset.set_data_limit(1)

    cls.test_dataset, _ = Dataset.load_external('medacy_dataset_end')
    cls.test_dataset.set_data_limit(2)

    cls.prediction_directory = tempfile.mkdtemp()  # directory to store predictions
@classmethod
def setUpClass(cls):
    if importlib.util.find_spec('medacy_dataset_end') is None:
        raise ImportError(
            "medacy_dataset_end was not automatically installed for testing."
            " See testing instructions for details.")

    cls.dataset, cls.entities = Dataset.load_external('medacy_dataset_end')
from os.path import join

from pkg_resources import resource_filename, resource_isdir, resource_listdir

from medacy.data import Dataset


def get_evaluation_dataset():
    """
    Leave the evaluation folder empty if no evaluation data is provided.

    :return: a medaCy Dataset object containing this Dataset's designated evaluation data,
        or None if the evaluation folder is absent or empty.
    """
    if not resource_isdir(package_name, join('data', 'evaluation')) \
            or not resource_listdir(package_name, join('data', 'evaluation')):
        return None
    return Dataset(resource_filename(package_name, join('data', 'evaluation')))
def setUpClass(cls): """ Loads END dataset and writes files to temp directory :return: """ cls.test_dir = tempfile.mkdtemp() # set up temp directory cls.dataset, cls.entities = Dataset.load_external('medacy_dataset_end') cls.ann_files = [] # fill directory of training files for data_file in cls.dataset.get_data_files(): file_name, raw_text, ann_text = (data_file.file_name, data_file.raw_path, data_file.ann_path) cls.ann_files.append(file_name + '.ann') with open(join(cls.test_dir, "broken_ann_file.ann"), 'w') as f: f.write("This is clearly not a valid ann file")
def get_training_dataset():
    """
    :return: a medaCy Dataset object containing this Dataset's designated training data.
    """
    return Dataset(resource_filename(package_name, 'data/training'))
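# Usage sketch: accessors like the two above ship inside a packaged dataset
# (e.g. medacy_dataset_end) and are what Dataset.load_external ultimately resolves.
# The direct calls below are illustrative; the exact module-level API of any given
# dataset package may differ.
import medacy_dataset_end

training = medacy_dataset_end.get_training_dataset()
evaluation = medacy_dataset_end.get_evaluation_dataset()  # None if no evaluation data ships

print(training.compute_counts())
if evaluation is not None:
    print(evaluation.compute_counts())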
from medacy.data import Dataset
from medacy.pipelines import SystematicReviewPipeline
from medacy.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs; set level=logging.DEBUG for more information
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# entity types
entities = ['ADR', 'Indication', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/SMM4H/data_smmh4h/task2/training/dataset')
# path = '../data_smmh4h/task2/training/dataset_1'

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
model.cross_validate(num_folds=5, training_dataset=training_dataset)
create_directory(dirTrain)
print("Fold:", i)

# copy every file from both datasets into the training directory
for item in ann_files_1:
    shutil.copy(dataset1 + '/' + item, dirTrain)
for item in ann_files_2:
    shutil.copy(dataset2 + '/' + item, dirTrain)
for item in txt_files_1:
    shutil.copy(dataset1 + '/' + item, dirTrain)
for item in txt_files_2:
    shutil.copy(dataset2 + '/' + item, dirTrain)

# move this fold's slice of dataset1 out of training and into testing
for item in ann_files_1[i * num_files:(i + 1) * num_files]:
    shutil.copy(dataset1 + '/' + item, dirTest)
    os.remove(dirTrain + '/' + item)
for item in txt_files_1[i * num_files:(i + 1) * num_files]:
    shutil.copy(dataset1 + '/' + item, dirTest)
    os.remove(dirTrain + '/' + item)

training_dataset = Dataset(dirTrain)
training_dataset.metamap(metamap)

model = Model(pipeline, n_jobs=1)
model.fit(training_dataset)

# run on a separate testing dataset
testing_dataset = Dataset(dirTest)

# location to store the predictions
model.predict(testing_dataset, prediction_directory=dirPrediction)
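# The fold body above assumes scaffolding like the following; everything here
# (paths, num_folds, the create_directory helper) is a hedged guess at the
# surrounding script, not code from the original.
import math
import os
import shutil

dataset1 = '/path/to/annotated/dataset1'  # placeholder paths
dataset2 = '/path/to/annotated/dataset2'
num_folds = 5

ann_files_1 = sorted(f for f in os.listdir(dataset1) if f.endswith('.ann'))
txt_files_1 = sorted(f for f in os.listdir(dataset1) if f.endswith('.txt'))
ann_files_2 = sorted(f for f in os.listdir(dataset2) if f.endswith('.ann'))
txt_files_2 = sorted(f for f in os.listdir(dataset2) if f.endswith('.txt'))

num_files = math.ceil(len(ann_files_1) / num_folds)  # held-out files per fold

def create_directory(path):
    """Create a fresh, empty directory, replacing any previous contents."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)

for i in range(num_folds):
    dirTrain = '/tmp/folds/%d/train' % i
    dirTest = '/tmp/folds/%d/test' % i
    dirPrediction = '/tmp/folds/%d/predictions' % i
    create_directory(dirTest)
    create_directory(dirPrediction)
    # ...fold body as in the fragment above...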
# This script demonstrates utilizing medaCy for a full model training/prediction/cross-validation use case.
# > python training_predicting.py model_name
# Builds a model named model_name with the pipeline and parameters defined below. The script places the model in
# its own directory along with the model's build log and model/pipeline parameters, keeping results easily
# referenceable at run time.
# Once a sufficient model is produced, consider wrapping it up into a medaCy-compatible model as described in the
# example guide.

from medacy.model import Model
from medacy.pipelines import SystematicReviewPipeline
from medacy.data import Dataset
from medacy.pipeline_components import MetaMap
import logging
import datetime
import time
import os
import sys

train_dataset, evaluation_dataset, entities = Dataset.load_external('medacy_dataset_tac_2018')

# For rapid model prototyping, train and predict by simply running the script with a model name as a parameter.
if len(sys.argv) < 2:
    exit(0)

model_name = sys.argv[1]  # name for the model; use underscores
model_notes = "notes about the current model"  # stored in a model information file by this script

model_directory = "/home/username/named_entity_recognition/challenges/challenge_n/models/%s" % model_name.replace(" ", '_')

if model_name == "" or os.path.isdir(model_directory):
    print("Model directory already exists, aborting")
    exit(0)
else:
    os.mkdir(model_directory)
from medacy.data import Dataset
# from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
import logging
import sys

# logging.basicConfig(filename=model_directory + '/build_%s.log' % current_time, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entities = ['Form', 'Route', 'Frequency', 'Reason', 'Duration', 'Dosage', 'ADE', 'Strength', 'Drug']
entities = ['Symptom', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2/symptom')
# training_dataset.set_data_limit(10)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=None, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

# model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)
def cross_validate(self, num_folds=10, training_dataset=None, prediction_directory=None):
    """
    Performs k-fold stratified cross-validation using our model and pipeline.

    If training_dataset and prediction_directory are both passed, the predictions made during
    each fold of cross validation are written to prediction_directory. This allows one to
    construct a confusion matrix or to compute the prediction ambiguity with the methods
    present in the Dataset class, supporting pipeline development without a designated
    evaluation set.

    :param num_folds: number of folds to split training data into for cross validation
    :param training_dataset: Dataset that is being cross validated (optional)
    :param prediction_directory: directory to write predictions of cross validation to,
        or `True` for the default predictions sub-directory.
    :return: Prints out performance metrics; returns a Dataset of predictions if
        prediction_directory is given.
    """
    if num_folds <= 1:
        raise ValueError("Number of folds for cross validation must be greater than 1")

    if prediction_directory is not None and training_dataset is None:
        raise ValueError(
            "Cannot generate predictions during cross validation if the training dataset is not given."
            " Please pass the training dataset in the 'training_dataset' parameter.")

    assert self.model is not None, "Cannot cross validate an un-fit model"
    assert self.X_data is not None and self.y_data is not None, \
        "Must have features and labels extracted for cross validation"

    X_data = self.X_data
    Y_data = self.y_data

    medacy_pipeline = self.pipeline
    cv = SequenceStratifiedKFold(folds=num_folds)
    named_entities = medacy_pipeline.entities
    evaluation_statistics = {}
    fold = 1

    for train_indices, test_indices in cv(X_data, Y_data):
        fold_statistics = {}
        learner_name, learner = medacy_pipeline.get_learner()

        X_train = [X_data[index] for index in train_indices]
        y_train = [Y_data[index] for index in train_indices]
        X_test = [X_data[index] for index in test_indices]
        y_test = [Y_data[index] for index in test_indices]

        logging.info("Training Fold %i", fold)
        train_data = [x[0] for x in X_train]
        test_data = [x[0] for x in X_test]

        learner.fit(train_data, y_train)
        y_pred = learner.predict(test_data)

        if prediction_directory is not None:
            # Dict mapping each source file to the sequences predicted for it
            preds_by_document = {filename: [] for filename in {x[2] for x in X_data}}

            # Flatten the nested sequence structures into 2d lists
            document_indices = []
            span_indices = []
            for sequence in X_test:
                document_indices += [sequence[2] for _ in range(len(sequence[0]))]
                span_indices += [element for element in sequence[1]]
            predictions = [element for sentence in y_pred for element in sentence]

            # Map the predicted sequences to their corresponding documents
            i = 0
            while i < len(predictions):
                if predictions[i] == 'O':
                    i += 1
                    continue
                entity = predictions[i]
                document = document_indices[i]
                first_start, first_end = span_indices[i]
                # Merge consecutive tokens that share the same label into one span
                while i < len(predictions) - 1 and predictions[i + 1] == entity:
                    i += 1
                last_start, last_end = span_indices[i]
                preds_by_document[document].append((entity, first_start, last_end))
                i += 1

        # Write the metrics for this fold.
        for label in named_entities:
            fold_statistics[label] = {}
            recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label])
            precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label])
            f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label])
            fold_statistics[label]['precision'] = precision
            fold_statistics[label]['recall'] = recall
            fold_statistics[label]['f1'] = f1

        # add averages
        fold_statistics['system'] = {}
        recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=named_entities)
        precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=named_entities)
        f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=named_entities)
        fold_statistics['system']['precision'] = precision
        fold_statistics['system']['recall'] = recall
        fold_statistics['system']['f1'] = f1

        table_data = [[label,
                       format(fold_statistics[label]['precision'], ".3f"),
                       format(fold_statistics[label]['recall'], ".3f"),
                       format(fold_statistics[label]['f1'], ".3f")]
                      for label in named_entities + ['system']]
        logging.info(tabulate(table_data,
                              headers=['Entity', 'Precision', 'Recall', 'F1'],
                              tablefmt='orgtbl'))

        evaluation_statistics[fold] = fold_statistics
        fold += 1

    statistics_all_folds = {}
    for label in named_entities + ['system']:
        statistics_all_folds[label] = {}
        statistics_all_folds[label]['precision_average'] = mean(
            [evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics])
        statistics_all_folds[label]['precision_max'] = max(
            [evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics])
        statistics_all_folds[label]['precision_min'] = min(
            [evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics])
        statistics_all_folds[label]['recall_average'] = mean(
            [evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics])
        statistics_all_folds[label]['recall_max'] = max(
            [evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics])
        statistics_all_folds[label]['recall_min'] = min(
            [evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics])
        statistics_all_folds[label]['f1_average'] = mean(
            [evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics])
        statistics_all_folds[label]['f1_max'] = max(
            [evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics])
        statistics_all_folds[label]['f1_min'] = min(
            [evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics])

    table_data = [[label,
                   format(statistics_all_folds[label]['precision_average'], ".3f"),
                   format(statistics_all_folds[label]['recall_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_average'], ".3f"),
                   format(statistics_all_folds[label]['f1_min'], ".3f"),
                   format(statistics_all_folds[label]['f1_max'], ".3f")]
                  for label in named_entities + ['system']]
    logging.info("\n" + tabulate(table_data,
                                 headers=['Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max'],
                                 tablefmt='orgtbl'))

    if prediction_directory:
        # Write annotations generated from cross-validation
        if not isinstance(prediction_directory, str):
            prediction_directory = training_dataset.data_directory + "/predictions/"

        if os.path.isdir(prediction_directory):
            logging.warning("Overwriting existing predictions")
        else:
            os.makedirs(prediction_directory)

        for data_file in training_dataset.get_data_files():
            logging.info("Predicting file: %s", data_file.file_name)
            with open(data_file.raw_path, 'r') as raw_text:
                doc = medacy_pipeline.spacy_pipeline.make_doc(raw_text.read())
                preds = preds_by_document[data_file.file_name]
                annotations = construct_annotations_from_tuples(doc, preds)
                annotations.to_ann(write_location=os.path.join(prediction_directory,
                                                               data_file.file_name + ".ann"))

        return Dataset(data_directory=prediction_directory)
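# Usage sketch for cross_validate(): the returned Dataset of fold predictions can be
# scored against the gold annotations with Dataset.compute_confusion_matrix, exactly as
# the evaluation script below does. The model and path here are placeholders.
from pprint import pprint

from medacy.data import Dataset

gold_dataset = Dataset('/path/to/gold/annotations')  # hypothetical path
# `model` is assumed to be a Model whose features have already been extracted
prediction_dataset = model.cross_validate(num_folds=5,
                                          training_dataset=gold_dataset,
                                          prediction_directory=True)

entities, confusion_matrix = gold_dataset.compute_confusion_matrix(prediction_dataset, leniency=1)
pprint(confusion_matrix)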
from medacy.data import Dataset
import logging
import sys
from pprint import pprint

# print logs
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
# entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
#             'Endpoint', 'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
#             'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
#             'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')
prediction_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC/predictions')

ambiguity_dict = training_dataset.compute_ambiguity(prediction_dataset)
# pprint(ambiguity_dict)

entities, confusion_matrix = training_dataset.compute_confusion_matrix(prediction_dataset, leniency=1)

pprint(training_dataset.compute_counts())
print(entities)
pprint(confusion_matrix)
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['Symptom', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2_END/symptom')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
model.cross_validate(num_folds=5, training_dataset=training_dataset)
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['treatment', 'problem', 'test']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/samantha/Desktop/Research/Data/i2b2/data')
# path = '../data_smmh4h/task2/training/dataset_1'

# set metamap path
# metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
# training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=None, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['treatment', 'problem', 'test']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/Data/i2b2/data')
# path = '../data_smmh4h/task2/training/dataset_1'

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)

# location to store the clinical model
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['ADE', 'Drug', 'Dose']

training_dataset = Dataset('/home/mahendrand/VE/Data/MADE/training')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

# cross validation
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
            'Endpoint', 'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
            'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
            'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
# keyword names follow the cross_validate signature above (was: dataset=..., write_predictions=True)
model.cross_validate(num_folds=5, training_dataset=training_dataset, prediction_directory=True)

# location to store the clinical model
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')

# location to store the predictions
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
import logging
import sys

# logging.basicConfig(filename=model_directory + '/build_%s.log' % current_time, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
#             'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
#             'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
#             'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']
entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
            'Endpoint', 'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
            'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
            'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/TAC/sample')
# training_dataset.set_data_limit(10)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = SystematicReviewPipeline(metamap=None, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

# model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)

model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')

# model.predict(training_dataset, prediction_directory='/home/mahendrand/VE/data_smmh4h/task2/training/metamap_predictions')
# model.predict(training_dataset)
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['ADR', 'Drug', 'Symptom']

training_dataset = Dataset('/home/mahendrand/VE/Data/CADEC/converted')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap", convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

# cross validation
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)

# location to store the clinical model
# model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')
# fold 1
# training_dataset_1 = Dataset('/home/mahendrand/VE/Data/CADEC_END/1/train')
# training_dataset_1.metamap(metamap)
#
# model_1 = Model(pipeline, n_jobs=1)
# model_1.fit(training_dataset_1)
#
# # run on a separate testing dataset
# testing_dataset_1 = Dataset('/home/mahendrand/VE/Data/CADEC_END/1/test')
#
# # location to store the predictions
# model_1.predict(testing_dataset_1, prediction_directory='/home/mahendrand/VE/Data/preds/5 fold/CADEC_END')

# fold 2
training_dataset_2 = Dataset('/home/mahendrand/VE/Data/CADEC_END/2/train')
# training_dataset_2.metamap(metamap)

model_2 = Model(pipeline, n_jobs=1)
model_2.fit(training_dataset_2)

# run on a separate testing dataset
testing_dataset_2 = Dataset('/home/mahendrand/VE/Data/CADEC_END/2/test')

# location to store the predictions
model_2.predict(testing_dataset_2,
                prediction_directory='/home/mahendrand/VE/Data/preds/5 fold/CADEC_END')

# fold 3
# training_dataset_3 = Dataset('/home/mahendrand/VE/Data/CADEC_END/3/train')
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['ADE', 'Drug', 'Reason']

training_dataset = Dataset('/home/mahendrand/VE/Data/END/drug')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

pipeline = ClinicalPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)

# cross validation
from medacy.data import Dataset
from medacy.ner.pipelines import SystematicReviewPipeline
from medacy.ner.model import Model
from medacy.pipeline_components import MetaMap
import logging
import sys

# print logs
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entity types
entities = ['Reason', 'ADE', 'Drug']
# entities = ['Symptom', 'Form', 'Route', 'Frequency', 'Duration', 'Dosage', 'Strength', 'Drug']

# dirPred = '/home/mahendrand/VE/Predictions/CV/N2C2'
training_dataset = Dataset('/home/mahendrand/VE/Data/N2C2/data')

# set metamap path
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = ClinicalPipeline(metamap=metamap, entities=entities)
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

model.fit(training_dataset)
from medacy.data import Dataset
from medacy.ner.pipelines import ClinicalPipeline
from medacy.ner.model import Model
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # set level=logging.DEBUG for more information

# entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
#             'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
#             'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
#             'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']
entities = ['CellLine', 'Dose', 'DoseDuration', 'DoseDurationUnits', 'DoseFrequency', 'DoseRoute', 'DoseUnits',
            'Endpoint', 'EndpointUnitOfMeasure', 'GroupName', 'GroupSize', 'SampleSize', 'Sex', 'Species', 'Strain',
            'TestArticle', 'TestArticlePurity', 'TestArticleVerification', 'TimeAtDose', 'TimeAtFirstDose',
            'TimeAtLastDose', 'TimeEndpointAssessed', 'TimeUnits', 'Vehicle']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/TAC/data_TAC')
# training_dataset.set_data_limit(10)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = ClinicalPipeline(metamap=None, entities=entities)

# distribute documents between n_jobs processes during training and prediction
model = Model(pipeline, n_jobs=1)

# model.fit(training_dataset)
model.cross_validate(num_folds=5,
                     training_dataset=training_dataset,
                     prediction_directory=True,
                     groundtruth_directory=True)