def get_local_features(self, text):
    """Build the feature dictionary for a single piece of text.

    The text is wrapped in a ``TextObj`` and a fixed series of
    ``feature_extractor`` routines is run over it. Every routine receives
    the same dictionary as its first argument (presumably filling it in
    place -- confirm against ``feature_extractor``), and that dictionary is
    returned once all routines have run.

    Args:
        text: The raw text to extract features from.

    Returns:
        The dictionary that was handed to each extractor.
    """
    features = {}
    parsed = TextObj(text)

    # Token-based features: n-grams at the extractor's default order, then
    # explicitly with n=2, followed by initialisms.
    feature_extractor.get_ngrams(features, parsed.tokens)
    feature_extractor.get_ngrams(features, parsed.tokens, n=2)
    feature_extractor.get_initialisms(features, parsed.tokens)

    # Length features need the raw text, the sentences and the tokens.
    feature_extractor.get_basic_lengths(
        features,
        parsed.text,
        parsed.sentences,
        parsed.tokens,
    )

    # Features computed from the raw text only.
    feature_extractor.get_repeated_punct(features, parsed.text)
    feature_extractor.get_LIWC(features, parsed.text)

    return features
def build_features(body, subject, stemmer=None, wn=None, features=None):
    """Build a combined feature dictionary for a message body and subject.

    Both texts are stemmed (when a stemmer is supplied), tokenized with
    ``wordpunct_tokenize`` and turned into boolean bag-of-words features
    keyed ``bow_<token>``. Further feature groups are switched on by name
    through *features*:

    * ``'bow_desc'`` -- separate ``body_<token>`` and ``subject_<token>``
      boolean features.
    * ``'liwc'`` -- ``get_LIWC`` is run on the stemmed subject and body; the
      results are passed through ``prepend_key`` with ``'SUBJLIWC'`` and
      ``'BODYLIWC'`` respectively.
    * ``'polarity'`` -- counts of tokens for which ``msol.lookup`` returns
      ``'positive'`` or ``'negative'``.
    * ``'wn'`` -- a boolean ``WN_<element>`` feature for every element that
      ``wn.lookup(token)`` returns.

    Args:
        body: Message body text.
        subject: Message subject text.
        stemmer: Object with a ``stem(text)`` method. When ``None`` the
            texts are used unstemmed.
        wn: Object with a ``lookup(token)`` method returning an iterable.
            Required when ``'wn'`` is in *features*; leaving it as ``None``
            in that case raises ``AttributeError``.
        features: Container of enabled feature-group names. ``None`` (the
            default) enables only the base bag-of-words features.

    Returns:
        Whatever ``combine_dicts`` returns for the feature groups
        (presumably one merged dictionary -- confirm against
        ``combine_dicts``).
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    enabled = () if features is None else features

    # The signature advertises stemmer=None, so treat stemming as optional
    # instead of failing on None.stem(...).
    if stemmer is None:
        stemmed_body, stemmed_subj = body, subject
    else:
        stemmed_body = stemmer.stem(body)
        stemmed_subj = stemmer.stem(subject)

    tokens_body = wordpunct_tokenize(stemmed_body)
    tokens_subj = wordpunct_tokenize(stemmed_subj)

    all_bow = {
        'bow_{}'.format(token): True for token in tokens_body + tokens_subj
    }

    body_bow = {}
    subj_bow = {}
    # NOTE(review): an older comment on this function spelled this flag
    # 'bow_disc'; the code has always tested 'bow_desc'. Confirm which
    # spelling callers actually pass.
    if 'bow_desc' in enabled:
        body_bow = {'body_{}'.format(token): True for token in tokens_body}
        subj_bow = {'subject_{}'.format(token): True for token in tokens_subj}

    subj_liwc = {}
    body_liwc = {}
    if 'liwc' in enabled:
        # get_LIWC is handed the target dict; presumably it fills it in place.
        get_LIWC(subj_liwc, stemmed_subj)
        get_LIWC(body_liwc, stemmed_body)

    polarity = defaultdict(int)
    if 'polarity' in enabled:
        for token in tokens_subj + tokens_body:
            label = msol.lookup(token)
            if label in ('negative', 'positive'):
                polarity[label] += 1

    all_wn = {}
    if 'wn' in enabled:
        for token in tokens_body + tokens_subj:
            # An empty lookup result simply contributes no features.
            for element in wn.lookup(token):
                all_wn['WN_{}'.format(element)] = True

    return combine_dicts(
        all_bow,
        subj_bow,
        body_bow,
        all_wn,
        prepend_key(subj_liwc, 'SUBJLIWC'),
        prepend_key(body_liwc, 'BODYLIWC'),
        polarity,
    )