def __init__(self, models_folder, essays_folder, spell_check_dict):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if not models_folder.endswith("/"):
        models_folder += "/"
    if not essays_folder.endswith("/"):
        essays_folder += "/"

    self.logger = logging.getLogger()

    cfg = get_config(essays_folder)
    self.config = cfg
    self.essays_folder = essays_folder

    # Create spell checker
    # Need annotations here purely to load the tags
    tagged_essays = load_bratt_essays(essays_folder, include_vague=cfg["include_vague"],
                                      include_normal=cfg["include_normal"], load_annotations=True)
    self.__set_tags_(tagged_essays)

    self.wd_sent_freq = defaultdict(int)
    self.spelling_corrector = build_spelling_corrector(tagged_essays, self.config["lower_case"],
                                                       self.wd_sent_freq, folder=spell_check_dict)

    # has to be an int as used in slices. In python 3.x this will automatically be a float
    offset = int((self.config["window_size"] - 1) / 2)

    unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
    biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

    extractors = [unigram_window_stemmed, biigram_window_stemmed]

    # most params below exist ONLY for the purposes of the hashing to and from disk
    self.feature_extractor = FeatureExtractorTransformer(extractors)

    # load models
    self.logger.info("Loading pickled models")
    store = ModelStore(models_folder=models_folder)

    self.feature_transformer = store.get_transformer()
    self.logger.info("Loaded Transformer")
    self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
    self.logger.info("Loaded word tagging model")
    self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
    self.logger.info("Loaded sentence classifier")
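# Illustrative usage of the constructor above (a sketch, not taken from the source):
# the enclosing class name "EssayTagger" and every path below are hypothetical
# placeholders for whatever the repo actually uses.
#
#   tagger = EssayTagger(models_folder="/data/CoralBleaching/models",
#                        essays_folder="/data/CoralBleaching/BrattData",
#                        spell_check_dict="/data/CoralBleaching/Spelling")
#   word_classifiers = tagger.tag_2_wd_classifier      # per-tag word-level models
#   sent_classifiers = tagger.tag_2_sent_classifier    # per-tag sentence-level models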
CV_FOLDS = 5
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"
features_filename_prefix = root_folder + "Pickled/feats_pickled_"

config = get_config(folder)

""" Load Essays """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
""" End load Essays """


def evaluate_window_size(config, window_size, features_filename_prefix):
    config["window_size"] = window_size

    """ FEATURE EXTRACTION """
    offset = (config["window_size"] - 1) / 2
    unigram_window = fact_extract_positional_word_features(offset)
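# memoize_to_disk is used above as a decorator factory: it wraps load_process_essays so
# that the processed essays are pickled to disk, keyed on the call arguments, and simply
# reloaded on later runs. The real implementation lives elsewhere in this repo; the
# sketch below only illustrates the pattern, and its hashing details are assumptions.
import hashlib
import os
import pickle

def memoize_to_disk_sketch(filename_prefix):
    def decorator(fn):
        def wrapped(**kwargs):
            # Build a cache file name from a hash of the keyword arguments
            key = hashlib.md5(repr(sorted(kwargs.items())).encode("utf-8")).hexdigest()
            cache_file = filename_prefix + key
            if os.path.exists(cache_file):
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            result = fn(**kwargs)
            with open(cache_file, "wb") as f:
                pickle.dump(result, f)
            return result
        return wrapped
    return decorator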
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
config = get_config(coref_output_folder)

results_processor = ResultsProcessor(dbname="metrics_causal_model")

########################################################

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays_train))  # should be 902

cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
cv_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]


def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)

########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902
MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
STEM = True
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
folder = root_folder + "Training/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(folder)
print(config)

mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
logger.info("Essays loaded")
len(tagged_essays)

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

tag_freq = get_tag_freq(tagged_essays)
regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit())))

""" FEATURE EXTRACTION """
config["window_size"] = 11
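# Illustration only (hypothetical frequencies): the filter above keeps tags whose first
# character is a digit, which appear to be the numbered concept codes, and drops the
# named tags.
example_tag_freq = {"1": 120, "5b": 40, "Anaphor": 15, "explicit": 60}
example_regular = sorted(t for t, f in example_tag_freq.items() if f >= 0 and t[0].isdigit())
assert example_regular == ["1", "5b"]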
from featureextractortransformer import FeatureExtractorTransformer
from load_data import load_process_essays
from featureextractionfunctions import *
from window_based_tagger_config import get_config

import cPickle as pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# not hashed as don't affect persistence of feature processing
config = get_config(data)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
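# Note: dict(config.items() + [("extractors", extractors)]) above relies on Python 2,
# where .items() returns a list. Under Python 3 the same merge could be written as the
# sketch below (not from the source):
#
#   feat_config = dict(config)
#   feat_config["extractors"] = extractors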
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal")

# Get Test Data In Order to Get Test CRELS
# load the test essays to make sure we compute metrics over the test CR labels
test_config = get_config(test_folder)
tagged_essays_test = load_process_essays(**test_config)

########################################################

fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
with open(fname, "rb") as f:
    pred_tagged_essays = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays))  # should be 902
MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"

train_config = get_config(training_folder)

""" FEATURE EXTRACTION """
train_config["window_size"] = 9
offset = (train_config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
triigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)
unigram_bow_window = fact_extract_bow_ngram_features(offset, 1)

# optimal SC feature set
extractors = [unigram_bow_window,
              unigram_window_stemmed,
              biigram_window_stemmed,
              #trigram_window_stemmed,
MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"
models_folder = settings.data_directory + "CoralBleaching/models/CRF"

""" Load Configs """
train_config = get_config(training_folder)
train_config["window_size"] = 9
offset = (train_config["window_size"] - 1) / 2

test_config = dict(train_config.items())
test_config["folder"] = test_folder

""" Load Data """
train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded - Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------

""" Define Tags """
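# Worked example of the window-offset arithmetic above (a sketch; it assumes the window
# is centred on the target word, i.e. the word itself plus `offset` neighbours on each
# side). These scripts rely on Python 2's integer "/"; "//" keeps the result an int
# under Python 3.
example_window_size = 9
example_offset = (example_window_size - 1) // 2
assert example_offset == 4
assert 2 * example_offset + 1 == example_window_size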
from FindFiles import find_files
from Settings import Settings
from load_data import load_process_essays
from window_based_tagger_config import get_config

import dill

CV_FOLDS = 5
DEV_SPLIT = 0.1

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
partition = "Training"  # Training | Test
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)
# override this so we don't replace INFREQUENT words
#config["min_df"] = 0

mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
print("{0} essays loaded".format(len(tagged_essays)))

coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition

coref_files = find_files(coref_folder, ".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)
from featureextractortransformer import FeatureExtractorTransformer
from load_data import load_process_essays
from featureextractionfunctions import *
from window_based_tagger_config import get_config

import cPickle as pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# not hashed as don't affect persistence of feature processing
config = get_config(data)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk

# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
model_store = ModelStore()

""" PETER - CHANGE THESE FILE PATHS """
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"               # Location where the training data is, use EBA_Pre and Post test essays preferably
test_folder = settings.data_directory + "CoralBleaching/BrattData/Merged/"                  # Location where the new essays to tag are located
out_predictions_file = settings.data_directory + "CoralBleaching/Results/predictions.txt"   # File to dump the predictions to

config = get_config(folder)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])
LOOK_BACK = 0  # how many sentences to look back when predicting tags

settings = Settings.Settings()
root = settings.data_directory + "/GlobalWarming/BrattFiles/merged/"

""" INPUT - two serialized files, one for the pre-processed essays, the other for the features """
""" OUTPUT """
processed_essay_filename_prefix = root + "Pickled/essays_proc_pickled_"
features_filename_prefix = root + "Pickled/feats_pickled_"

out_predictions_file = root + "Experiment/Output/predictions.txt"
out_predicted_margins_file = root + "Experiment/Output/predicted_confidence.txt"
out_metrics_file = root + "Experiment/Output/metrics.txt"
out_categories_file = root + "Experiment/Output/categories.txt"

config = get_config(root)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# replace periods in tags so that we can store results in mongo
replace_periods(tagged_essays)
logger.info("Essays loaded")
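# replace_periods is called above because MongoDB field names may not contain ".", so
# tags have to be rewritten before results are stored. A minimal sketch of the idea
# only - the real helper and its replacement character are assumptions:
def replace_period_in_tag_sketch(tag):
    return tag.replace(".", "_")

assert replace_period_in_tag_sketch("5.b") == "5_b"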
CV_FOLDS = 5
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
model_store = ModelStore()

""" PETER - CHANGE THESE FILE PATHS """
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"               # Location where the training data is, use EBA_Pre and Post test essays preferably
test_folder = settings.data_directory + "CoralBleaching/BrattData/Merged/"                  # Location where the new essays to tag are located
out_predictions_file = settings.data_directory + "CoralBleaching/Results/predictions.txt"   # File to dump the predictions to

config = get_config(folder)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
SPARSE_WD_FEATS = True

MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
partition = "Training"  # Training | Test
target_folder = root_folder + partition + "/"
processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_"

config = get_config(target_folder)

# LOAD ESSAYS
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# map parsed essays to essay name
print("{0} essays loaded".format(len(tagged_essays)))

# LOAD COREF RESULTS
coref_root = root_folder + "CoReference/"
coref_folder = coref_root + partition
coref_files = find_files(coref_folder, ".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
assert len(coref_files) == len(tagged_essays)
settings = Settings.Settings()
root = settings.data_directory + "/GlobalWarming/BrattFiles/merged/"

""" INPUT - two serialized files, one for the pre-processed essays, the other for the features """
""" OUTPUT """
processed_essay_filename_prefix = root + "Pickled/essays_proc_pickled_"
features_filename_prefix = root + "Pickled/feats_pickled_"

out_predictions_file = root + "Experiment/Output/predictions.txt"
out_predicted_margins_file = root + "Experiment/Output/predicted_confidence.txt"
out_metrics_file = root + "Experiment/Output/metrics.txt"
out_categories_file = root + "Experiment/Output/categories.txt"

config = get_config(root)

""" FEATURE EXTRACTION """
offset = (config["window_size"] - 1) / 2

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays(**config)
# replace periods in tags so that we can store results in mongo
replace_periods(tagged_essays)