train_config["window_size"] = 9 offset = (train_config["window_size"] - 1) / 2 test_config = dict(train_config.items()) test_config["folder"] = test_folder """ Load Data """ train_tagged_essays = load_process_essays(**train_config) test_tagged_essays = load_process_essays(**test_config) logger.info("Essays loaded - Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays))) # Create Corpus in CRF Format (list of list of tuples(word,tag)) # -------------------------------------------------------------- """ Define Tags """ tag_freq = get_tag_freq(train_tagged_essays) regular_tags = list( set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit()))) """ FEATURE EXTRACTION """ unigram_window_stemmed = fact_extract_positional_word_features(offset, True) biigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=2, stem_words=True) trigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=3, stem_words=True) unigram_bow_window = fact_extract_ngram_features(offset=offset, ngram_size=1, positional=False,
test_folder = root_folder + "Test/"

# Build the training configuration, then derive the test configuration from
# it by copying every entry and pointing "folder" at the test data.
train_config = get_config(training_folder)
test_config = {key: value for key, value in train_config.items()}
test_config["folder"] = test_folder

train_tagged_essays = load_process_essays(**train_config)
test_tagged_essays = load_process_essays(**test_config)
essay_counts = (len(train_tagged_essays), len(test_tagged_essays))
logger.info("Essays loaded - Train: %i Test %i" % essay_counts)

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------
tag_freq = get_tag_freq(train_tagged_essays)
# Distinct tags that start with a digit (and have a non-negative frequency).
regular_tags = list({
    tag_name
    for tag_name, tag_count in tag_freq.items()
    if tag_count >= 0 and tag_name[0].isdigit()
})

""" FEATURE EXTRACTION """
# Accumulators for labels and predictions, one list per tag.
cv_wd_td_ys_by_tag = defaultdict(list)
cv_wd_td_predictions_by_tag = defaultdict(list)
cv_wd_vd_ys_by_tag = defaultdict(list)
cv_wd_vd_predictions_by_tag = defaultdict(list)

# A single "fold": train on the training essays, validate on the test essays.
folds = [(train_tagged_essays, test_tagged_essays)]
results = Parallel(n_jobs=len(folds))(
    delayed(train_classifer_on_fold)(td_essays, vd_essays, regular_tags, fold_ix)
    for fold_ix, (td_essays, vd_essays) in enumerate(folds)
)
for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/" folder = root_folder + "Training/" processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_" config = get_config(folder) print(config) mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays) tagged_essays = mem_process_essays(**config) logger.info("Essays loaded") len(tagged_essays) # Create Corpus in CRF Format (list of list of tuples(word,tag)) # -------------------------------------------------------------- tag_freq = get_tag_freq(tagged_essays) regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit()))) """ FEATURE EXTRACTION """ config["window_size"] = 11 offset = (config["window_size"] - 1) / 2 cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list) cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list) folds = cross_validation(tagged_essays, CV_FOLDS) results = Parallel(n_jobs=CV_FOLDS)( delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold) for fold, (essays_TD, essays_VD) in enumerate(folds)) for result in results:
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/" folder = root_folder + "Training/" processed_essay_filename_prefix = root_folder + "Pickled/essays_proc_pickled_" config = get_config(folder) print(config) mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays) tagged_essays = mem_process_essays(**config) logger.info("Essays loaded") len(tagged_essays) # Create Corpus in CRF Format (list of list of tuples(word,tag)) # -------------------------------------------------------------- tag_freq = get_tag_freq(tagged_essays) regular_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit()))) """ FEATURE EXTRACTION """ config["window_size"] = 9 offset = (config["window_size"] - 1) / 2 unigram_window_stemmed = fact_extract_positional_word_features(offset, True) biigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=2, stem_words=True) trigram_window_stemmed = fact_extract_ngram_features(offset=offset, ngram_size=3, stem_words=True) unigram_bow_window = fact_extract_ngram_features(offset=offset, ngram_size=1, positional=False, stem_words=False) extractors = [ unigram_bow_window, unigram_window_stemmed,