def extract_features(tagged_essays,
                     extractors=None,
                     window_size=None,
                     min_sentence_length=None,
                     folder=None,
                     min_df=None,
                     remove_infrequent=None,
                     spelling_correct=None,
                     replace_nums=None,
                     stem=None,
                     remove_stop_words=None,
                     remove_punctuation=None,
                     lower_case=None,
                     include_vague=None,
                     include_normal=None):
    # Only `extractors` affects the output; the remaining parameters exist solely
    # so that calls can be hashed when persisting feature processing to and from disk.
    feature_extractor = FeatureExtractorTransformer(extractors)
    return feature_extractor.transform(tagged_essays)
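
# A minimal usage sketch (an assumption, not project code): the extractor
# factories and the window_size -> offset arithmetic mirror the pipeline below,
# and `tagged_essays` is assumed to come from load_process_essays. All other
# keyword arguments keep their defaults, since only `extractors` matters here.
window_size = 7
offset = int((window_size - 1) / 2)  # 3 words of context either side of the target word
example_extractors = [fact_extract_positional_word_features_stemmed(offset),
                      fact_extract_ngram_features_stemmed(offset, 2)]
essay_feats = extract_features(tagged_essays, extractors=example_extractors,
                               window_size=window_size)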
import logging
import pickle

logger = logging.getLogger()

# not hashed, as these don't affect the persistence of the feature processing;
# `data`, `serialized_essays` and `serialized_features` are defined elsewhere in the script
config = get_config(data)

""" FEATURE EXTRACTION """
# has to be an int as it is used in slices; in Python 3.x plain division yields a float
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
bigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, bigram_window_stemmed]
# config.items() is a view in Python 3, so materialize it before concatenating
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)
essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

# pickle requires binary mode
with open(serialized_essays, "wb") as f_essays:
    pickle.dump(tagged_essays, f_essays)
with open(serialized_features, "wb") as f_feats:
    pickle.dump(essay_feats, f_feats)
logger.info("Serialized")
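
# Round-trip sketch (standard pickle behaviour, not project code): the cached
# artifacts written above can be reloaded to skip re-running load_process_essays
# and the feature extractor entirely. Binary mode matches the dump above.
with open(serialized_essays, "rb") as f_essays:
    tagged_essays = pickle.load(f_essays)
with open(serialized_features, "rb") as f_feats:
    essay_feats = pickle.load(f_feats)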
import logging
from collections import defaultdict
from traceback import format_exc

import numpy as np
from nltk.tokenize import sent_tokenize

# Project-specific helpers (Config, get_config, load_bratt_essays, Essay, process_essays,
# build_spelling_corrector, FeatureExtractorTransformer, ModelStore, TaggedSentence,
# TaggedWord, friendly_tag, cr_sort_key, essay_category, flatten_to_wordlevel_feat_tags,
# test_classifier_per_code, get_sent_feature_for_stacking_from_tagging_model, the
# fact_extract_* factories and the CAUSAL_REL / CAUSE_RESULT / RESULT_REL constants)
# are assumed to be imported from elsewhere in the repository.


class Annotator(object):
    @classmethod
    def from_config(cls, config_file):
        cfg = Config(config_file)
        return Annotator(cfg.models_folder, cfg.essays_folder, cfg.spell_check_dict)

    def __init__(self, models_folder, essays_folder, spell_check_dict):
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder,
                                          include_vague=cfg["include_vague"],
                                          include_normal=cfg["include_normal"],
                                          load_annotations=True)
        self.__set_tags_(tagged_essays)

        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(tagged_essays,
                                                           self.config["lower_case"],
                                                           self.wd_sent_freq,
                                                           folder=spell_check_dict)

        # has to be an int as it is used in slices; in Python 3.x plain division yields a float
        offset = int((self.config["window_size"] - 1) / 2)

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
        bigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
        extractors = [unigram_window_stemmed, bigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer = store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")

    def annotate(self, essay_text):
        try:
            sentences = sent_tokenize(essay_text.strip())
            contents = "\n".join(sentences)

            essay = Essay(full_path=None,
                          include_vague=self.config["include_vague"],
                          include_normal=self.config["include_normal"],
                          load_annotations=False,
                          essay_text=contents)

            processed_essays = process_essays(essays=[essay],
                                              spelling_corrector=self.spelling_corrector,
                                              wd_sent_freq=self.wd_sent_freq,
                                              remove_infrequent=self.config["remove_infrequent"],
                                              spelling_correct=self.config["spelling_correct"],
                                              replace_nums=self.config["replace_nums"],
                                              stem=self.config["stem"],
                                              remove_stop_words=self.config["remove_stop_words"],
                                              remove_punctuation=self.config["remove_punctuation"],
                                              lower_case=self.config["lower_case"])
            self.logger.info("Essay loaded successfully")

            essays_TD = self.feature_extractor.transform(processed_essays)
            wd_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
            xs = self.feature_transformer.transform(wd_feats)

            wd_predictions_by_code = test_classifier_per_code(xs, self.tag_2_wd_classifier,
                                                              self.wd_test_tags)

            dummy_wd_td_ys_bytag = defaultdict(lambda: np.asarray([0.0] * xs.shape[0]))
            sent_xs, sent_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(
                self.sent_input_feat_tags, self.sent_input_interaction_tags, essays_TD, xs,
                dummy_wd_td_ys_bytag, self.tag_2_wd_classifier, sparse=True, look_back=0)

            """ Test Stack Classifier """
            sent_predictions_by_code = test_classifier_per_code(
                sent_xs, self.tag_2_sent_classifier, self.sent_output_train_test_tags)

            """ Generate Return Values """
            essay_tags = self.__get_essay_tags_(sent_predictions_by_code)

            if "coral" in self.essays_folder.lower():
                essay_type = "CB"
            elif "skin" in self.essays_folder.lower():
                essay_type = "SC"
            else:
                raise Exception("Unknown essay type")

            raw_essay_tags = ",".join(sorted(essay_tags, key=cr_sort_key))

            t_words = self.__get_tagged_words_(essay, essays_TD[0], wd_predictions_by_code)
            t_sentences = self.__get_tagged_sentences_(essay, sent_predictions_by_code)
            tagged_sentences = [t_sent.add_word_tags([twd.__dict__ for twd in t_wds]).__dict__
                                for t_sent, t_wds in zip(t_sentences, t_words)]

            essay_codes, essay_causal = self.__format_essay_tags_(essay_tags)
            return {"tagged_sentences": tagged_sentences,
                    "essay_codes": essay_codes,
                    "essay_causal": essay_causal,
                    "essay_category": essay_category(raw_essay_tags, essay_type),
                    "raw_essay_tags": raw_essay_tags}
        except Exception:
            self.logger.exception("An exception occurred while annotating the essay")
            return {"error": format_exc()}

    def __set_tags_(self, tagged_essays):
        MIN_TAG_FREQ = 5
        tag_freq = defaultdict(int)
        for essay in tagged_essays:
            for sentence in essay.tagged_sentences:
                un_tags = set()
                for word, tags in sentence:
                    for tag in tags:
                        if "5b" in tag:
                            continue
                        if ((tag[-1].isdigit() or tag in {"Causer", "explicit", "Result"}
                             or tag.startswith("Causer") or tag.startswith("Result")
                             or tag.startswith("explicit") or "->" in tag)
                                and not ("Anaphor" in tag or "rhetorical" in tag or "other" in tag)):
                            un_tags.add(tag)
                for tag in un_tags:
                    tag_freq[tag] += 1

        all_tags = list(tag_freq.keys())
        freq_tags = list(set(tag for tag, freq in tag_freq.items() if freq >= MIN_TAG_FREQ))
        non_causal = [t for t in freq_tags if "->" not in t]
        only_causal = [t for t in freq_tags if "->" in t]

        CAUSE_TAGS = ["Causer", "Result", "explicit"]
        CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

        """ works best with all the pair-wise causal relation codes """
        # Include all tags for the output
        self.wd_test_tags = list(set(all_tags + CAUSE_TAGS))
        # tags from the tagging model used to train the stacked model
        self.sent_input_feat_tags = list(set(freq_tags + CAUSE_TAGS))
        # find interactions between these predicted tags from the word tagger to feed to the sentence tagger
        self.sent_input_interaction_tags = list(set(non_causal + CAUSE_TAGS))
        # tags to train (as output) for the sentence-based classifier
        self.sent_output_train_test_tags = list(set(all_tags + CAUSE_TAGS + CAUSAL_REL_TAGS))

    def __is_tag_to_return_(self, tag):
        return tag[0].isdigit() or ("->" in tag and "Causer" in tag)

    def __get_regular_tags_(self, pred_tags):
        # numeric concept codes sorted numerically; non-numeric codes sort first
        r_tags = sorted(filter(lambda t: t[0].isdigit() and "->" not in t, pred_tags),
                        key=lambda s: (int(s), s) if s.isdigit() else (-1, s))
        return ",".join(r_tags)

    def __get_causal_tags_(self, pred_tags):
        c_tags = sorted(filter(lambda t: "->" in t, pred_tags), key=cr_sort_key)
        return ",".join(c_tags)

    def __get_tagged_sentences_(self, essay, sent_predictions_by_code):
        tagged_sents = []
        for i, sent in enumerate(essay.tagged_sentences):
            wds, _ = zip(*sent)
            str_sent = " ".join(wds)
            pred_tags = set()
            for tag, array in sent_predictions_by_code.items():
                if self.__is_tag_to_return_(tag) and np.max(array[i]) == 1:
                    pred_tags.add(friendly_tag(tag))
            str_r_tags = self.__get_regular_tags_(pred_tags)
            str_c_tags = self.__get_causal_tags_(pred_tags)
            tagged_sents.append(TaggedSentence(str_sent, str_r_tags, str_c_tags))
        return tagged_sents

    def __get_essay_tags_(self, sent_predictions_by_code):
        tags = set()
        for tag, array in sent_predictions_by_code.items():
            if np.max(array) == 1:
                tags.add(tag)
        return tags

    def __format_essay_tags_(self, tags):
        # materialize into a list: in Python 3 a bare map() is an iterator and
        # would be exhausted by the first of the two calls below
        tags = [friendly_tag(t) for t in tags if self.__is_tag_to_return_(t)]
        str_r_tags = self.__get_regular_tags_(tags)
        str_c_tags = self.__get_causal_tags_(tags)
        return str_r_tags, str_c_tags

    def __fuzzy_match_(self, original, feat_wd):
        original = original.lower().strip()
        feat_wd = feat_wd.lower().strip()
        if original == feat_wd:
            return True
        if original[:3] == feat_wd[:3]:
            return True
        # fall back to the Jaccard similarity of the two words' character sets
        a, b = set(original), set(feat_wd)
        jaccard = float(len(a.intersection(b))) / float(len(a.union(b)))
        return jaccard >= 0.5

    def __align_wd_tags_(self, orig, feats):
        """
        Once processed, there may be a different number of words than in the original
        sentence. Try to recover the tags for the original words by aligning the two
        using simple heuristics.
        """
        if len(orig) < len(feats):
            raise Exception("align_wd_tags() : Processed sentence is longer than the original!")

        o_wds, _ = zip(*orig)
        feat_wds, new_tags = zip(*feats)
        if len(orig) == len(feats):
            return list(zip(o_wds, new_tags))

        # here orig is longer than feats
        diff = len(orig) - len(feats)
        tagged_wds = []
        feat_offset = 0
        while len(tagged_wds) < len(o_wds):
            i = len(tagged_wds)
            orig_wd = o_wds[i]
            self.logger.debug("%i %s", i, orig_wd)
            if i >= len(feats):
                # ran past the processed words; re-use the last tag set
                tagged_wds.append((orig_wd, new_tags[-1]))
                continue

            new_tag_ix = i - feat_offset
            feat_wd = feats[new_tag_ix][0]
            if feat_wd == "INFREQUENT" or feat_wd.isdigit():
                tagged_wds.append((orig_wd, new_tags[new_tag_ix]))
                continue

            new_tagged_wds = []
            found = False
            for j in range(i, i + diff + 1):
                new_tagged_wds.append((o_wds[j], new_tags[new_tag_ix]))
                next_orig_wd = o_wds[j]
                if self.__fuzzy_match_(next_orig_wd, feat_wd):
                    found = True
                    tagged_wds.extend(new_tagged_wds)
                    feat_offset += len(new_tagged_wds) - 1
                    break
            if not found:
                raise Exception("No matching word found for index:%i and processed word:%s"
                                % (i, feat_wd))
        return tagged_wds

    def __get_tagged_words_(self, original_essay, essay_TD, wd_predictions_by_code):
        tagged_sents = []
        # there should be a one-to-one correspondence between the words in essay_TD and the predictions
        i = 0
        for sent_ix, sent in enumerate(essay_TD.sentences):
            tmp_tagged_wds = []
            for wix, feat in enumerate(sent):
                word = feat.word
                tags = set()
                for tag in wd_predictions_by_code.keys():
                    if wd_predictions_by_code[tag][i] > 0:
                        tags.add(tag)
                i += 1
                tmp_tagged_wds.append((word, tags))

            # now align the predicted tags with the original words
            wds, aligned_tags = zip(*self.__align_wd_tags_(original_essay.tagged_sentences[sent_ix],
                                                           tmp_tagged_wds))
            # spelling correct (needs to happen after alignment)
            fr_aligned_tags = [set(map(friendly_tag, tags)) for tags in aligned_tags]
            tagged_words = list(zip(wds, fr_aligned_tags))
            tagged_sents.append([TaggedWord(wd,
                                            self.spelling_corrector.correct(wd),
                                            self.__get_regular_tags_(tags),
                                            self.__get_causal_tags_(tags))
                                 for wd, tags in tagged_words])
        return tagged_sents
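
# A minimal end-to-end sketch (hypothetical: the config path and essay text are
# made up; Config's schema -- models_folder, essays_folder, spell_check_dict --
# is defined elsewhere in the repository). The result keys are the ones built
# by annotate() above.
annotator = Annotator.from_config("annotator_config.json")
result = annotator.annotate("Rising water temperatures stress the coral, "
                            "and the coral expels its symbiotic algae.")
if "error" in result:
    print(result["error"])
else:
    print(result["essay_codes"])     # comma-separated concept codes
    print(result["essay_causal"])    # comma-separated causal relations (tags containing "->")
    print(result["essay_category"])  # category derived from the raw essay tags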