def __init__(self):
     """Attach the grammar corrector, spelling corrector and stemmer to self.

     ``GrammarCorrector``, ``SpellCorrector`` and ``stem`` are not defined in
     this fragment; they must already be in scope where it is used.
     """
     # One fresh corrector instance of each kind per object.
     self.grammar = GrammarCorrector()
     self.spelling = SpellCorrector()
     # Stored by reference (not called here) so it can be used via self.stem.
     self.stem = stem
Exemplo n.º 2
0
 def __init__(self):
     """Attach the grammar corrector, spelling corrector and stemmer to self.

     ``GrammarCorrector``, ``SpellCorrector`` and ``stem`` are not defined in
     this fragment; they must already be in scope where it is used.
     """
     # One fresh corrector instance of each kind per object.
     self.grammar = GrammarCorrector()
     self.spelling = SpellCorrector()
     # Stored by reference (not called here) so it can be used via self.stem.
     self.stem = stem
class MetaFeatureGenerator(object):
    """Derive numeric meta features (grammar, spelling, size) from a text.

    The grammar and spelling correctors are created in ``__init__``; both are
    expected to expose ``correct_string(text)`` returning a 3-tuple that is
    unpacked here as ``(errors, markup, raw)``.
    """

    # Part-of-speech tags grouped by the category they are counted under
    # (Penn Treebank-style tag names).
    speech_parts = dict(
        nouns=["NN", "NNP", "NNPS", "NNS"],
        verbs=["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
        adjectives=["JJ", "JJR", "JJS"],
        adverbs=["RB", "RBR", "RBS"]
    )

    def __init__(self):
        """Create the correctors and keep a reference to the stemmer."""
        self.grammar = GrammarCorrector()
        self.spelling = SpellCorrector()
        self.stem = stem

    def generate_grammar_features(self, raw_grammar):
        """Return the share of tokens falling into each ``speech_parts`` group.

        Args:
            raw_grammar: sized sequence of items whose element ``[1]`` is the
                part-of-speech tag (e.g. ``(word, tag)`` pairs).

        Returns:
            dict mapping every key of ``speech_parts`` to
            ``count / (len(raw_grammar) + 1)``.
        """
        grammar_counts = dict.fromkeys(self.speech_parts, 0)

        for token in raw_grammar:
            tag = token[1]
            for category, tags in self.speech_parts.items():
                if tag in tags:
                    grammar_counts[category] += 1

        # The +1 keeps the division defined when no tokens were tagged.
        denominator = len(raw_grammar) + 1
        return {
            category: count / denominator
            for category, count in grammar_counts.items()
        }

    def clean_spell_corrected_tags(self, spelling_markup):
        """Strip every ``<...>`` tag from the spell checker's markup."""
        return re.sub(r"<[^>]+>", '', spelling_markup)

    def generate_text_features(self, text):
        """Return simple size statistics for ``text``.

        Returns:
            dict with ``length`` (characters), ``word_length``
            (whitespace-separated words), ``sentence_length`` (pieces
            produced by splitting on ``"."``) and three ratios derived from
            them. Each ratio's denominator is incremented by one, so empty
            text never divides by zero.
        """
        feats = {}
        feats['length'] = len(text)
        feats['word_length'] = len(text.split())
        feats['sentence_length'] = len(text.split("."))
        sentence_denominator = feats['sentence_length'] + 1
        feats['chars_per_sentence'] = feats['length'] / sentence_denominator
        feats['words_per_sentence'] = feats['word_length'] / sentence_denominator
        feats['chars_per_word'] = feats['length'] / (feats['word_length'] + 1)
        return feats

    def generate_clean_stem_text(self, text):
        """Return a lower-cased, cleaned and stemmed version of ``text``.

        The text goes through the spell corrector, its markup tags are
        removed, every character other than ASCII letters, digits, space and
        ``. , ' " : ;`` becomes a space, whitespace runs collapse to a single
        space, and each space-separated token is passed through ``self.stem``.
        """
        # Only the markup element of the spell checker's result is needed.
        _errors, spelling_markup, _raw = self.spelling.correct_string(text)
        clean_text = self.clean_spell_corrected_tags(spelling_markup)
        # Raw strings: "\." and "\s" are invalid escapes in a normal literal.
        clean_text = re.sub(r"[^A-Za-z0-9 .,'\":;]", " ", clean_text.lower())
        clean_text = re.sub(r"\s+", " ", clean_text)
        return ' '.join([self.stem(token) for token in clean_text.split(' ')])

    def generate_meta_features(self, text):
        """Return the combined grammar, spelling and size features of ``text``.

        Returns:
            dict holding ``grammar_errors``, one ratio per ``speech_parts``
            category, ``spelling_errors`` and every key produced by
            ``generate_text_features``.
        """
        features = {}
        grammar_errors, _grammar_markup, raw_grammar = self.grammar.correct_string(text)
        features['grammar_errors'] = grammar_errors
        features.update(self.generate_grammar_features(raw_grammar))

        spelling_errors, _spelling_markup, _raw_spelling = self.spelling.correct_string(text)
        features['spelling_errors'] = spelling_errors

        features.update(self.generate_text_features(text))

        return features
Exemplo n.º 4
0
class MetaFeatureGenerator(object):
    """Derive numeric meta features (grammar, spelling, size) from a text.

    The grammar and spelling correctors are created in ``__init__``; both are
    expected to expose ``correct_string(text)`` returning a 3-tuple that is
    unpacked here as ``(errors, markup, raw)``.
    """

    # Part-of-speech tags grouped by the category they are counted under
    # (Penn Treebank-style tag names).
    speech_parts = dict(nouns=["NN", "NNP", "NNPS", "NNS"],
                        verbs=["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
                        adjectives=["JJ", "JJR", "JJS"],
                        adverbs=["RB", "RBR", "RBS"])

    def __init__(self):
        """Create the correctors and keep a reference to the stemmer."""
        self.grammar = GrammarCorrector()
        self.spelling = SpellCorrector()
        self.stem = stem

    def generate_grammar_features(self, raw_grammar):
        """Return the share of tokens falling into each ``speech_parts`` group.

        Args:
            raw_grammar: sized sequence of items whose element ``[1]`` is the
                part-of-speech tag (e.g. ``(word, tag)`` pairs).

        Returns:
            dict mapping every key of ``speech_parts`` to
            ``count / len(raw_grammar)``. When ``raw_grammar`` is empty every
            ratio is ``0.0`` instead of raising ``ZeroDivisionError``.
        """
        total = len(raw_grammar)
        if total == 0:
            # Nothing was tagged, so no category has any share of the tokens.
            return dict.fromkeys(self.speech_parts, 0.0)

        grammar_counts = dict.fromkeys(self.speech_parts, 0)
        for token in raw_grammar:
            tag = token[1]
            for category, tags in self.speech_parts.items():
                if tag in tags:
                    grammar_counts[category] += 1

        return {
            category: count / total
            for category, count in grammar_counts.items()
        }

    def clean_spell_corrected_tags(self, spelling_markup):
        """Strip every ``<...>`` tag from the spell checker's markup."""
        return re.sub(r"<[^>]+>", '', spelling_markup)

    def generate_text_features(self, text):
        """Return simple size statistics for ``text``.

        Returns:
            dict with ``length`` (characters), ``word_length``
            (whitespace-separated words), ``sentence_length`` (pieces
            produced by splitting on ``"."``) and three ratios derived from
            them. Each ratio's denominator is incremented by one, so empty
            text never divides by zero.
        """
        feats = {}
        feats['length'] = len(text)
        feats['word_length'] = len(text.split())
        feats['sentence_length'] = len(text.split("."))
        sentence_denominator = feats['sentence_length'] + 1
        feats['chars_per_sentence'] = feats['length'] / sentence_denominator
        feats['words_per_sentence'] = feats['word_length'] / sentence_denominator
        feats['chars_per_word'] = feats['length'] / (feats['word_length'] + 1)
        return feats

    def generate_clean_stem_text(self, text):
        """Return a lower-cased, cleaned and stemmed version of ``text``.

        The text goes through the spell corrector, its markup tags are
        removed, every character other than ASCII letters, digits, space and
        ``. , ' " : ;`` becomes a space, whitespace runs collapse to a single
        space, and each space-separated token is passed through ``self.stem``.
        """
        # Only the markup element of the spell checker's result is needed.
        _errors, spelling_markup, _raw = self.spelling.correct_string(text)
        clean_text = self.clean_spell_corrected_tags(spelling_markup)
        # Raw strings: "\." and "\s" are invalid escapes in a normal literal.
        clean_text = re.sub(r"[^A-Za-z0-9 .,'\":;]", " ", clean_text.lower())
        clean_text = re.sub(r"\s+", " ", clean_text)
        return ' '.join([self.stem(token) for token in clean_text.split(' ')])

    def generate_meta_features(self, text):
        """Return the combined grammar, spelling and size features of ``text``.

        Returns:
            dict holding ``grammar_errors``, one ratio per ``speech_parts``
            category, ``spelling_errors`` and every key produced by
            ``generate_text_features``.
        """
        features = {}
        grammar_errors, _grammar_markup, raw_grammar = self.grammar.correct_string(
            text)
        features['grammar_errors'] = grammar_errors
        features.update(self.generate_grammar_features(raw_grammar))

        spelling_errors, _spelling_markup, _raw_spelling = self.spelling.correct_string(
            text)
        features['spelling_errors'] = spelling_errors

        features.update(self.generate_text_features(text))

        return features