def load_process_essays(window_size, min_sentence_length, folder, min_df, remove_infrequent,
                        spelling_correct, replace_nums, stem, remove_stop_words,
                        remove_punctuation, lower_case, include_vague, include_normal):
    # NOTE: window_size and min_sentence_length are accepted here but are not
    # forwarded to process_essays below

    essays = load_bratt_essays(directory=folder, include_vague=include_vague, include_normal=include_normal)
    return process_essays(essays, min_df=min_df, remove_infrequent=remove_infrequent, spelling_correct=spelling_correct,
                          replace_nums=replace_nums, stem=stem, remove_stop_words=remove_stop_words,
                          remove_punctuation=remove_punctuation, lower_case=lower_case, spelling_corrector=None)
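
A minimal usage sketch for load_process_essays; every argument value below is an illustrative assumption, not a value taken from the source.

# hypothetical invocation - all argument values are assumptions
processed_essays = load_process_essays(
    window_size=7, min_sentence_length=3, folder="/path/to/bratt/essays/",
    min_df=5, remove_infrequent=False, spelling_correct=True,
    replace_nums=True, stem=False, remove_stop_words=False,
    remove_punctuation=True, lower_case=True,
    include_vague=True, include_normal=True)
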
    # __init__ of a model-loading class; the enclosing class definition is not part of this snippet
    def __init__(self, models_folder, essays_folder, spell_check_dict):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder,
                                          include_vague=cfg["include_vague"],
                                          include_normal=cfg["include_normal"],
                                          load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(
            tagged_essays,
            self.config["lower_case"],
            self.wd_sent_freq,
            folder=spell_check_dict)

        # cast to int because offset is used in slices; in Python 3.x, / performs true division and returns a float
        offset = int((self.config["window_size"] - 1) / 2)
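        # e.g. window_size = 7 -> offset = 3 words of context on either side of the target word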

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
        bigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, bigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer = store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")
Example #4
def load_essays(include_vague=INCLUDE_VAGUE, include_normal=INCLUDE_NORMAL):
    # 'folder' is a module-level global defined elsewhere in the source module
    return load_bratt_essays(directory=folder,
                             include_vague=include_vague,
                             include_normal=include_normal)
Example #5
# tail of process_essays; the earlier lines of that function are not included in this snippet
                new_sentence.append((cw, tags))
            if len(new_sentence) > 0:
                lst_sentences.append(new_sentence)
    return processed_essays

def process_sentences(essays, min_df=5,
                      remove_infrequent=False, spelling_correct=True,
                      replace_nums=True, stem=False, remove_stop_words=False,
                      remove_punctuation=True, lower_case=True):
    """
    Flattens the processed essays into a single list of sentences
    """

    processed_essays = process_essays(essays, min_df=min_df,
                                      remove_infrequent=remove_infrequent, spelling_correct=spelling_correct,
                                      replace_nums=replace_nums, stem=stem, remove_stop_words=remove_stop_words,
                                      remove_punctuation=remove_punctuation, lower_case=lower_case)
    sentences = []
    for essay in processed_essays:
        for sentence in essay.sentences:
            sentences.append(sentence)
    return sentences

if __name__ == "__main__":

    from BrattEssay import load_bratt_essays

    essays = load_bratt_essays()
    processed_sentences = process_sentences(essays, stem=True, spelling_correct=False)
    pass
Example #8
""" Settings """
""" Start Script """
WINDOW_SIZE = 7  # 7 is best
MID_IX = int(round(WINDOW_SIZE / 2.0) - 1)
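# e.g. WINDOW_SIZE = 7 -> int(round(7 / 2.0) - 1) = 3, the index of the window's middle word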

MIN_SENTENCE_FREQ = 2
PCT_VALIDATION = 0.25
MIN_FEAT_FREQ = 5  # 15 best so far

SENTENCE_START = "<START>"
SENTENCE_END = "<END>"
STEM = True
""" Load Essays """
essays = load_bratt_essays(
    "/Users/simon.hughes/Dropbox/Phd/Data/CoralBleaching/BrattData/Merged/")

all_codes = set()
all_words = []

CAUSAL_REL = "CRel"
RESULT_REL = "RRel"
CAUSE_RESULT = "C->R"

cr_codes = [CAUSAL_REL, RESULT_REL, CAUSE_RESULT]

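# each essay exposes tagged_sentences, where every sentence is a sequence of
# (word, tags) pairs - as implied by the unpacking in the loop below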
for essay in essays:
    for sentence in essay.tagged_sentences:
        for w, tags in sentence:
            all_words.append(w)
            all_codes.update(tags)
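
# illustrative follow-up (an assumption, not part of the original script):
# summarise the vocabulary and tag set collected above
print("distinct words:", len(set(all_words)))
print("distinct codes:", len(all_codes))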
""" Settings """
""" Start Script """
WINDOW_SIZE = 7 #7 is best
MID_IX = int(round(WINDOW_SIZE / 2.0) - 1)

MIN_SENTENCE_FREQ = 2
PCT_VALIDATION  = 0.2
MIN_FEAT_FREQ = 5     #15 best so far
PCT_VALIDATION = 0.25

SENTENCE_START = "<START>"
SENTENCE_END   = "<END>"
STEM = True

""" Load Essays """
essays = load_bratt_essays("/Users/simon.hughes/Dropbox/Phd/Data/CoralBleaching/BrattData/Merged/")

all_codes = set()
all_words = []

CAUSAL_REL = "CRel"
RESULT_REL = "RRel"
CAUSE_RESULT = "C->R"

cr_codes = [CAUSAL_REL, RESULT_REL, CAUSE_RESULT]

for essay in essays:
    for sentence in essay.tagged_sentences:
        for w, tags in sentence:
            all_words.append(w)
            all_codes.update(tags)