def extract_features(tagged_essays, extractors=None, window_size=None, min_sentence_length=None, folder=None,
                     min_df=None, remove_infrequent=None,
                     spelling_correct=None,
                     replace_nums=None, stem=None, remove_stop_words=None,
                     remove_punctuation=None, lower_case=None,
                     include_vague=None, include_normal=None):
    # Only `extractors` is used here; the remaining keyword arguments appear to
    # exist so that the full configuration can be hashed when features are
    # cached to and from disk (see the comments in the later examples).
    feature_extractor = FeatureExtractorTransformer(extractors)
    return feature_extractor.transform(tagged_essays)
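
# Usage sketch (illustrative, not part of the source): drive extract_features with
# the same window-based extractor factories used in the examples below. Here
# `tagged_essays` and `window_size` are assumed to be loaded/configured already.
window_size = 5
offset = int((window_size - 1) / 2)
extractors = [fact_extract_positional_word_features_stemmed(offset),
              fact_extract_ngram_features_stemmed(offset, 2)]
essay_feats = extract_features(tagged_essays, extractors=extractors)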
Example #2
    def __init__(self, models_folder, essays_folder, spell_check_dict):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder,
                                          include_vague=cfg["include_vague"],
                                          include_normal=cfg["include_normal"],
                                          load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(
            tagged_essays,
            self.config["lower_case"],
            self.wd_sent_freq,
            folder=spell_check_dict)

        # has to be an int as it is used in slices; in Python 3.x, / always returns a float
        offset = int((self.config["window_size"] - 1) / 2)
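        # e.g. window_size = 5 gives offset = 2: two context words on each side
        # of the target word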

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(
            offset)
        biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, biigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer = store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")

import logging
import pickle

logger = logging.getLogger()

# not hashed, as these settings don't affect how the processed features are persisted
config = get_config(data)

""" FEATURE EXTRACTION """
# has to be an int as it is used in slices; in Python 3.x, / always returns a float
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
# dict.items() returns a view in Python 3, so materialize it before concatenating
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk

# Collapse all variants of a tag into one tag
feature_extractor = FeatureExtractorTransformer(extractors)

essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

with open(serialized_essays, "wb") as f_essays:  # pickle requires binary mode
    pickle.dump(tagged_essays, f_essays)

with open(serialized_features, "wb") as f_feats:
    pickle.dump(essay_feats, f_feats)

logger.info("Serialized")
Example #6
class Annotator(object):

    @classmethod
    def from_config(cls, config_file):
        cfg = Config(config_file)
        return Annotator(cfg.models_folder, cfg.essays_folder, cfg.spell_check_dict)

    def __init__(self, models_folder, essays_folder, spell_check_dict):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder, include_vague=cfg["include_vague"], include_normal=cfg["include_normal"], load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(tagged_essays, self.config["lower_case"], self.wd_sent_freq, folder=spell_check_dict)

        # has to be an int as it is used in slices; in Python 3.x, / always returns a float
        offset = int((self.config["window_size"] - 1) / 2)

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
        biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, biigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer =  store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")

    def annotate(self, essay_text):

        try:
            sentences = sent_tokenize(essay_text.strip())
            contents = "\n".join(sentences)

            essay = Essay(full_path=None, include_vague=self.config["include_vague"],
                          include_normal=self.config["include_normal"], load_annotations=False, essay_text=contents)

            processed_essays = process_essays(essays=[essay],
                                              spelling_corrector=self.spelling_corrector,
                                              wd_sent_freq=self.wd_sent_freq,
                                              remove_infrequent=self.config["remove_infrequent"],
                                              spelling_correct=self.config["spelling_correct"],
                                              replace_nums=self.config["replace_nums"],
                                              stem=self.config["stem"],
                                              remove_stop_words=self.config["remove_stop_words"],
                                              remove_punctuation=self.config["remove_punctuation"],
                                              lower_case=self.config["lower_case"])

            self.logger.info("Essay loaded successfully")
            essays_TD = self.feature_extractor.transform(processed_essays)

            wd_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
            xs = self.feature_transformer.transform(wd_feats)

            wd_predictions_by_code = test_classifier_per_code(xs, self.tag_2_wd_classifier, self.wd_test_tags)

            # placeholder zero labels: the true word-level tags are unknown when
            # annotating new text, but the stacking feature builder expects them
            dummy_wd_td_ys_bytag = defaultdict(lambda: np.asarray([0.0] * xs.shape[0]))
            sent_xs, sent_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(
                self.sent_input_feat_tags,
                self.sent_input_interaction_tags,
                essays_TD, xs,
                dummy_wd_td_ys_bytag,
                self.tag_2_wd_classifier,
                sparse=True,
                look_back=0)

            """ Test Stack Classifier """

            sent_predictions_by_code = test_classifier_per_code(
                sent_xs, self.tag_2_sent_classifier, self.sent_output_train_test_tags)

            """ Generate Return Values """
            essay_tags = self.__get_essay_tags_(sent_predictions_by_code)

            # infer the essay corpus type from the folder name: "CB" (coral) or "SC" (skin)
            essay_type = None
            if "coral" in self.essays_folder.lower():
                essay_type = "CB"
            elif "skin" in self.essays_folder.lower():
                essay_type = "SC"
            else:
                raise Exception("Unknown essay type")

            raw_essay_tags = ",".join(sorted(essay_tags, key=cr_sort_key))

            t_words = self.__get_tagged_words_(essay, essays_TD[0], wd_predictions_by_code)
            t_sentences = self.__get_tagged_sentences_(essay, sent_predictions_by_code)

            tagged_sentences = [t_sent.add_word_tags([twd.__dict__ for twd in t_wds]).__dict__
                                for t_sent, t_wds in zip(t_sentences, t_words)]

            essay_codes, essay_causal = self.__format_essay_tags_(essay_tags)
            return {"tagged_sentences"  :   tagged_sentences,

                    "essay_codes"       :   essay_codes,
                    "essay_causal"      :   essay_causal,
                    "essay_category"    :   essay_category(raw_essay_tags, essay_type),

                    "raw_essay_tags"    :   raw_essay_tags
            }
        except Exception:
            self.logger.exception("An exception occurred while annotating the essay")
            return {"error": format_exc()}

    def __set_tags_(self, tagged_essays):

        MIN_TAG_FREQ = 5  # a tag must appear in at least this many sentences to count as frequent

        tag_freq = defaultdict(int)
        for essay in tagged_essays:
            for sentence in essay.tagged_sentences:
                un_tags = set()
                for word, tags in sentence:
                    for tag in tags:
                        if "5b" in tag:
                            continue
                        # keep concept codes and causal-relation tags; skip Anaphor/rhetorical/other
                        if ((tag[-1].isdigit() or tag in {"Causer", "explicit", "Result"}
                             or tag.startswith("Causer") or tag.startswith("Result")
                             or tag.startswith("explicit") or "->" in tag)
                                and not ("Anaphor" in tag or "rhetorical" in tag or "other" in tag)):
                            un_tags.add(tag)
                for tag in un_tags:
                    tag_freq[tag] += 1

        all_tags = list(tag_freq.keys())
        freq_tags = list(set(tag for tag, freq in tag_freq.items() if freq >= MIN_TAG_FREQ))
        non_causal = [t for t in freq_tags if "->" not in t]
        only_causal = [t for t in freq_tags if "->" in t]

        CAUSE_TAGS = ["Causer", "Result", "explicit"]
        CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

        """ works best with all the pair-wise causal relation codes """
        # Include all tags for the output
        self.wd_test_tags = list(set(all_tags + CAUSE_TAGS))

        # tags from tagging model used to train the stacked model
        self.sent_input_feat_tags = list(set(freq_tags + CAUSE_TAGS))
        # find interactions between these predicted tags from the word tagger to feed to the sentence tagger
        self.sent_input_interaction_tags = list(set(non_causal + CAUSE_TAGS))
        # tags to train (as output) for the sentence based classifier
        self.sent_output_train_test_tags = list(set(all_tags + CAUSE_TAGS + CAUSAL_REL_TAGS))

    def __is_tag_to_return_(self, tag):
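        # return only concept codes (tags starting with a digit) and causal
        # relations ("->") that involve a Causer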
        return tag[0].isdigit() or ("->" in tag and "Causer" in tag)

    def __get_regular_tags_(self, pred_tags):
        r_tags = sorted(filter(lambda t: t[0].isdigit() and "->" not in t, pred_tags),
                        key=lambda s: (int(s), s) if s.isdigit() else (-1, s))
        str_r_tags = ",".join(r_tags)
        return str_r_tags

    def __get_causal_tags_(self, pred_tags):
        c_tags = sorted(filter(lambda t: "->" in t, pred_tags), key=cr_sort_key)
        str_c_tags = ",".join(c_tags)
        return str_c_tags
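
    # Example (illustrative): given pred_tags = {"3", "7", "3->7"}, the two
    # helpers above return "3,7" and "3->7" respectively.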

    def __get_tagged_sentences_(self, essay, sent_predictions_by_code):
        tagged_sents = []
        for i, sent in enumerate(essay.tagged_sentences):
            wds, _ = zip(*sent)
            str_sent = " ".join(wds)
            pred_tags = set()
            for tag, array in sent_predictions_by_code.items():
                if self.__is_tag_to_return_(tag):
                    if np.max(array[i]) == 1:
                        pred_tags.add(friendly_tag(tag))

            str_r_tags = self.__get_regular_tags_(pred_tags)
            str_c_tags = self.__get_causal_tags_(pred_tags)

            tagged_sents.append(TaggedSentence(str_sent, str_r_tags, str_c_tags))
        return tagged_sents

    def __get_essay_tags_(self, sent_predictions_by_code):
        tags = set()

        for tag, array in sent_predictions_by_code.items():
            if np.max(array) == 1:
                tags.add(tag)

        return tags

    def __format_essay_tags_(self, tags):

        # materialize the filtered tags so they can be iterated over twice below
        # (in Python 3 a bare map/filter would be exhausted by the first call)
        tags = [friendly_tag(t) for t in tags if self.__is_tag_to_return_(t)]

        str_r_tags = self.__get_regular_tags_(tags)
        str_c_tags = self.__get_causal_tags_(tags)

        return str_r_tags, str_c_tags

    def __fuzzy_match_(self, original, feat_wd):
        original = original.lower().strip()
        feat_wd = feat_wd.lower().strip()
        if original == feat_wd:
            return True
        if original[:3] == feat_wd[:3]:
            return True
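        # fall back to character-set Jaccard similarity; e.g. "skin" vs "skln"
        # share 3 of 5 distinct characters, so jaccard = 3/5 = 0.6 and they match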
        a = set(original)
        b = set(feat_wd)
        jaccard = float(len(a.intersection(b))) / float(len(a.union(b)))
        return jaccard >= 0.5

    def __align_wd_tags_(self, orig, feats):
        """
        Once processed, there may be a different number of words than in the original sentence
        Try and recover the tags for the original words by aligning the two using simple heuristics
        """
        if len(orig) < len(feats):
            raise Exception("align_wd_tags() : Original sentence is longer!")

        o_wds, _ = zip(*orig)
        feat_wds, new_tags = zip(*feats)

        if len(orig) == len(feats):
            return zip(o_wds, new_tags)

        # here orig is longer than feats
        diff = len(orig) - len(feats)
        tagged_wds = []
        feat_offset = 0
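        # walk the original words; feat_offset tracks how many extra original
        # words have been absorbed into a single processed token, so
        # i - feat_offset indexes the matching processed word and its tags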
        while len(tagged_wds) < len(o_wds):
            i = len(tagged_wds)
            orig_wd = o_wds[i]

            if i >= len(feats):
                tagged_wds.append((orig_wd, new_tags[-1]))
                continue
            else:
                new_tag_ix = i - feat_offset
                feat_wd = feats[new_tag_ix][0]
                if feat_wd == "INFREQUENT" or feat_wd.isdigit():
                    tagged_wds.append((orig_wd, new_tags[new_tag_ix]))
                    continue

                new_tagged_wds = []
                found = False
                for j in range(i, i + diff + 1):
                    new_tagged_wds.append((o_wds[j], new_tags[new_tag_ix]))
                    next_orig_wd = o_wds[j]
                    if self.__fuzzy_match_(next_orig_wd, feat_wd):
                        found = True
                        tagged_wds.extend(new_tagged_wds)
                        feat_offset += len(new_tagged_wds) - 1
                        break
                if not found:
                    raise Exception("No matching word found for index:%i and processed word:%s" % (i, feat_wd))
        return tagged_wds

    def __get_tagged_words_(self, original_essay, essay_TD, wd_predictions_by_code):
        tagged_sents = []
        # there should be a one-to-one correspondence between the words in essays_TD[0] and the predictions
        i = 0
        for sent_ix, sent in enumerate(essay_TD.sentences):
            tmp_tagged_wds = []
            for feat in sent:
                word = feat.word
                tags = set()
                for tag in wd_predictions_by_code.keys():
                    if wd_predictions_by_code[tag][i] > 0:
                        tags.add(tag)
                i += 1
                tmp_tagged_wds.append((word, tags))

            # Now align the predicted tags with the original words
            wds, aligned_tags = zip(*self.__align_wd_tags_(original_essay.tagged_sentences[sent_ix], tmp_tagged_wds))
            # spelling-correct each word (must happen after the alignment above)

            fr_aligned_tags = map(lambda tags: set(map(friendly_tag, tags)), aligned_tags)
            tagged_words = list(zip(wds, fr_aligned_tags))
            tagged_sents.append(
                [TaggedWord(wd, self.spelling_corrector.correct(wd),
                            self.__get_regular_tags_(tags), self.__get_causal_tags_(tags))
                 for wd, tags in tagged_words])
        return tagged_sents
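
# Usage sketch (illustrative): build an Annotator from a config file and annotate
# raw essay text. The config path and essay text here are hypothetical.
annotator = Annotator.from_config("annotator.cfg")
result = annotator.annotate("Rising sea temperatures cause coral bleaching.")
if "error" not in result:
    print(result["essay_codes"], result["essay_causal"])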