import datetime
import os
import time

# Write information about the model before training.
with open(model_directory + "/model_information.txt", 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" % len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# Dump the fitted model, timestamped so successive runs do not overwrite each other.
current_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H.%M.%S')
model.dump(model_directory + "/tac_2018_%s_%s.pkl" % (model_name, current_time))

# If no evaluation data is available, skip prediction and only cross validate the model.
# MetaMap the evaluation data; this automatically skips if the dataset is pre-metamapped.
evaluation_dataset.metamap(metamap, n_jobs=30)

# Predict over evaluation_dataset with the model trained above, then store the
# predictions in the given output directory.
model.predict(evaluation_dataset, prediction_directory=os.path.join(model_directory, 'predictions/'))

# Perform sequence-stratified cross validation over the trained model.
# Note that all extracted features are held in memory while this runs.
model.cross_validate(num_folds=10)
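# A minimal sketch of reloading the dumped model in a later session, assuming
# medaCy's Model.load() is the counterpart of the Model.dump() call used above.
# The pickle filename and the reuse of `pipeline` here are illustrative, not
# taken from the original script.
reloaded_model = Model(pipeline, n_jobs=1)
reloaded_model.load(model_directory + "/tac_2018_example.pkl")  # hypothetical filename
reloaded_model.predict(evaluation_dataset,
                       prediction_directory=os.path.join(model_directory, 'predictions/'))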
# Imports assume the medaCy package layout in use when this script was written.
import logging

from medacy.data import Dataset
from medacy.model import Model
from medacy.pipeline_components import MetaMap
from medacy.pipelines import SystematicReviewPipeline

logging.basicConfig(level=logging.DEBUG)  # DEBUG level for more verbose output

# Entity types to extract.
entities = ['ADR', 'Indication', 'Drug']

# training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
training_dataset = Dataset('/home/mahendrand/VE/SMM4H/data_smmh4h/task2/training/dataset')
# path = '../data_smmh4h/task2/training/dataset_1'

# Set the MetaMap path and metamap the training data.
metamap = MetaMap(metamap_path="/home/share/programs/metamap/2016/public_mm/bin/metamap",
                  convert_ascii=True)
training_dataset.metamap(metamap)

# pipeline = SystematicReviewPipeline(metamap=None, entities=meta_data['entities'])
pipeline = SystematicReviewPipeline(metamap=metamap, entities=entities)
model = Model(pipeline, n_jobs=1)  # distribute documents between n_jobs processes during training and prediction

model.fit(training_dataset)
model.cross_validate(num_folds=5, dataset=training_dataset, write_predictions=True)

# Location to store the fitted clinical model.
model.dump('/home/mahendrand/VE/SMM4H/medaCy/medacy/clinical_model.pickle')

# Location to store the predictions.
# model.predict(training_dataset, prediction_directory='/home/mahendrand/VE/SMM4H/data_smmh4h/task2/training/dataset/metamap_predictions')
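# The commented-out Dataset.load_external() call above points to an alternative
# setup in which a packaged dataset provides the training/evaluation splits and
# entity types. A minimal sketch under that assumption, using only names that
# already appear in this script:
training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_smm4h_2019')
pipeline = SystematicReviewPipeline(metamap=metamap, entities=meta_data['entities'])
model = Model(pipeline, n_jobs=1)
model.fit(training_dataset)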