def load_process_essays(window_size, min_sentence_length, folder, min_df, remove_infrequent, spelling_correct, replace_nums, stem, remove_stop_words, remove_punctuation, lower_case, include_vague, include_normal):
    """Load the bratt-annotated essays from *folder* and run the standard
    essay-processing pipeline over them.

    NOTE(review): ``window_size`` and ``min_sentence_length`` are accepted but
    never used in this body — presumably kept so the signature matches a
    config/hashing scheme elsewhere; confirm before removing.
    """
    raw_essays = load_bratt_essays(directory=folder,
                                   include_vague=include_vague,
                                   include_normal=include_normal)
    # Forward the remaining knobs straight through to the pipeline;
    # no pre-built spelling corrector is supplied here.
    pipeline_kwargs = dict(
        min_df=min_df,
        remove_infrequent=remove_infrequent,
        spelling_correct=spelling_correct,
        replace_nums=replace_nums,
        stem=stem,
        remove_stop_words=remove_stop_words,
        remove_punctuation=remove_punctuation,
        lower_case=lower_case,
        spelling_corrector=None,
    )
    return process_essays(raw_essays, **pipeline_kwargs)
def __init__(self, models_folder, essays_folder, spell_check_dict):
    """Load config, tagged essays, the spelling corrector, feature
    extractors, and the pickled models from disk.

    Args:
        models_folder:    folder containing the pickled models (ModelStore).
        essays_folder:    folder containing the bratt-annotated essays.
        spell_check_dict: passed as ``folder=`` to build_spelling_corrector —
            presumably a spell-check dictionary location; TODO confirm.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Normalize both folder paths so they always end with a trailing slash.
    if not models_folder.endswith("/"):
        models_folder += "/"
    if not essays_folder.endswith("/"):
        essays_folder += "/"
    self.logger = logging.getLogger()
    cfg = get_config(essays_folder)
    self.config = cfg
    self.essays_folder = essays_folder

    # Create spell checker
    # Need annotations here purely to load the tags
    tagged_essays = load_bratt_essays(essays_folder,
                                      include_vague=cfg["include_vague"],
                                      include_normal=cfg["include_normal"],
                                      load_annotations=True)
    self.__set_tags_(tagged_essays)
    self.wd_sent_freq = defaultdict(int)
    self.spelling_corrector = build_spelling_corrector(
        tagged_essays, self.config["lower_case"], self.wd_sent_freq, folder=spell_check_dict)

    # has to be an int as used in slices. In python 3.x this will automatically be a float
    offset = int((self.config["window_size"] - 1) / 2)

    # Window-based unigram and bigram feature extractors.
    unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
    biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
    extractors = [unigram_window_stemmed, biigram_window_stemmed]
    # most params below exist ONLY for the purposes of the hashing to and from disk
    self.feature_extractor = FeatureExtractorTransformer(extractors)

    # load models
    self.logger.info("Loading pickled models")
    store = ModelStore(models_folder=models_folder)
    self.feature_transformer = store.get_transformer()
    self.logger.info("Loaded Transformer")
    self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
    self.logger.info("Loaded word tagging model")
    self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
    self.logger.info("Loaded sentence classifier")
def __init__(self, models_folder, essays_folder, spell_check_dict):
    """Initialize the tagger: read configuration, build the spelling
    corrector from the annotated essays, set up windowed feature
    extractors, and load the pickled word/sentence models from disk.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    self.logger = logging.getLogger()

    # Ensure both folder paths carry a trailing separator.
    models_folder = models_folder if models_folder.endswith("/") else models_folder + "/"
    essays_folder = essays_folder if essays_folder.endswith("/") else essays_folder + "/"

    self.config = get_config(essays_folder)
    self.essays_folder = essays_folder

    # Annotations are loaded solely so the tag set can be extracted.
    tagged_essays = load_bratt_essays(essays_folder,
                                      include_vague=self.config["include_vague"],
                                      include_normal=self.config["include_normal"],
                                      load_annotations=True)
    self.__set_tags_(tagged_essays)

    # Spelling corrector built over the tagged essays.
    self.wd_sent_freq = defaultdict(int)
    self.spelling_corrector = build_spelling_corrector(
        tagged_essays, self.config["lower_case"], self.wd_sent_freq, folder=spell_check_dict)

    # Half-window offset must be an int because it is used in slice
    # expressions (true division would otherwise yield a float on python 3.x).
    half_window = int((self.config["window_size"] - 1) / 2)
    extractors = [
        fact_extract_positional_word_features_stemmed(half_window),
        fact_extract_ngram_features_stemmed(half_window, 2),
    ]
    # The extractor parameters exist ONLY for hashing to and from disk.
    self.feature_extractor = FeatureExtractorTransformer(extractors)

    # Pull the pre-trained, pickled models out of the store.
    self.logger.info("Loading pickled models")
    store = ModelStore(models_folder=models_folder)
    self.feature_transformer = store.get_transformer()
    self.logger.info("Loaded Transformer")
    self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
    self.logger.info("Loaded word tagging model")
    self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
    self.logger.info("Loaded sentence classifier")
def load_essays(include_vague=INCLUDE_VAGUE, include_normal=INCLUDE_NORMAL):
    """Load the bratt essays using the module-level default flags.

    NOTE(review): ``folder`` is not a parameter and is not defined in this
    function — it must be a module-level global defined elsewhere in the file
    (not visible in this chunk); verify it exists, otherwise this raises
    NameError at call time.
    """
    return load_bratt_essays(directory=folder, include_vague=include_vague, include_normal=include_normal)
lower_case=True): """ Flattens the processed essays by extracting just the sentences from the esays """ processed_essays = process_essays(essays, min_df=min_df, remove_infrequent=remove_infrequent, spelling_correct=spelling_correct, replace_nums=replace_nums, stem=stem, remove_stop_words=remove_stop_words, remove_punctuation=remove_punctuation, lower_case=lower_case) sentences = [] for essay in processed_essays: for sentence in essay.sentences: sentences.append(sentence) return sentences if __name__ == "__main__": from BrattEssay import load_bratt_essays essays = load_bratt_essays() processed_sentences = process_sentences(essays, stem=True, spelling_correct=False) pass
new_sentence.append((cw, tags)) if len(new_sentence) > 0: lst_sentences.append(new_sentence) return processed_essays def process_sentences(essays, min_df=5, remove_infrequent=False, spelling_correct=True, replace_nums=True, stem=False, remove_stop_words=False, remove_punctuation=True, lower_case=True): """ Flattens the processed essays by extracting just the sentences from the esays """ processed_essays = process_essays(essays, min_df=min_df, remove_infrequent=remove_infrequent, spelling_correct=spelling_correct, replace_nums=replace_nums, stem=stem, remove_stop_words=remove_stop_words, remove_punctuation=remove_punctuation, lower_case=lower_case) sentences = [] for essay in processed_essays: for sentence in essay.sentences: sentences.append(sentence) return sentences if __name__ == "__main__": from BrattEssay import load_bratt_essays essays = load_bratt_essays() processed_sentences = process_sentences(essays, stem=True, spelling_correct=False) pass
None """ Settings """ """ Start Script """ WINDOW_SIZE = 7 #7 is best MID_IX = int(round(WINDOW_SIZE / 2.0) - 1) MIN_SENTENCE_FREQ = 2 PCT_VALIDATION = 0.2 MIN_FEAT_FREQ = 5 #15 best so far PCT_VALIDATION = 0.25 SENTENCE_START = "<START>" SENTENCE_END = "<END>" STEM = True """ Load Essays """ essays = load_bratt_essays( "/Users/simon.hughes/Dropbox/Phd/Data/CoralBleaching/BrattData/Merged/") all_codes = set() all_words = [] CAUSAL_REL = "CRel" RESULT_REL = "RRel" CAUSE_RESULT = "C->R" cr_codes = [CAUSAL_REL, RESULT_REL, CAUSE_RESULT] for essay in essays: for sentence in essay.tagged_sentences: for w, tags in sentence: all_words.append(w) all_codes.update(tags)
""" Settings """ """ Start Script """ WINDOW_SIZE = 7 #7 is best MID_IX = int(round(WINDOW_SIZE / 2.0) - 1) MIN_SENTENCE_FREQ = 2 PCT_VALIDATION = 0.2 MIN_FEAT_FREQ = 5 #15 best so far PCT_VALIDATION = 0.25 SENTENCE_START = "<START>" SENTENCE_END = "<END>" STEM = True """ Load Essays """ essays = load_bratt_essays("/Users/simon.hughes/Dropbox/Phd/Data/CoralBleaching/BrattData/Merged/") all_codes = set() all_words = [] CAUSAL_REL = "CRel" RESULT_REL = "RRel" CAUSE_RESULT = "C->R" cr_codes = [CAUSAL_REL, RESULT_REL, CAUSE_RESULT] for essay in essays: for sentence in essay.tagged_sentences: for w, tags in sentence: all_words.append(w) all_codes.update(tags)