def __init__(self):
    """Create the feature-extraction helper and the feature metadata.

    Sets:
        self.Functions: a fresh ``Functions`` instance.
        self.seq: one comma-separated string naming the moral-foundation
            features.
        self.num_features: number of names in ``self.seq`` (11).
        self.feature_names: the same string as ``self.seq``.
    """
    self.Functions = Functions()
    self.seq = (
        'HarmVirtue, HarmVice, ' +
        'FairnessVirtue, FairnessVice, IngroupVirtue, IngroupVice, ' +
        'AuthorityVirtue, AuthorityVice, PurityVirtue, PurityVice, ' +
        'MoralityGeneral')
    # self.seq is a single string here, so count its comma-separated names.
    # Iterating it directly would walk individual characters and yield
    # (number of characters + number of commas) instead of 11.
    self.num_features = len(self.seq.split(','))
    self.feature_names = self.seq
class bias_vectorizer(TransformerMixin):
    """Transformer that maps article texts to bias-lexicon and subjectivity features.

    Any custom transformer needs to inherit sklearn's TransformerMixin, or be
    a Python class that implements a ``fit`` method.

    Each article becomes one row of 16 values: the 14 values returned by
    ``Functions.bias_lexicon_feats`` followed by the 2 values returned by
    ``Functions.subjectivity``.
    """

    def __init__(self):
        """Create the feature-extraction helper and the feature metadata."""
        self.Functions = Functions()
        self.seq = (
            'bias_count, assertives_count, ' +
            'factives_count, hedges_count, implicatives_count, ' +
            'report_verbs_count, positive_op_count, negative_op_count, ' +
            'wneg_count, wpos_count, wneu_count, sneg_count, ' +
            'spos_count, sneu_count, NB_pobj, NB_psubj')
        # self.seq is a single string, so count its comma-separated names.
        # Iterating it directly would walk characters and inflate the count,
        # which made the all-zero row for empty articles the wrong width.
        self.num_features = len(self.seq.split(','))
        self.feature_names = self.seq

    def transform(self, X):
        """Return a 2-D array with one feature row per article in ``X``.

        Args:
            X: sized iterable of article strings.

        Returns:
            ``np.ndarray`` reshaped to ``(len(X), row_width)``. When ``X`` is
            empty the result has shape ``(0, self.num_features)``.
        """
        vects = []
        for article in X:
            if not article.strip():
                # If text is not available, generate a set of zeros.
                # NOTE(review): the zeros are the string '0'; mixing them with
                # numeric rows makes numpy build a string array -- confirm
                # what the downstream consumer expects.
                seq = ['0'] * self.num_features
            else:
                NB_pobj, NB_psubj = self.Functions.subjectivity(article)
                (bias_count, assertives_count, factives_count, hedges_count,
                 implicatives_count, report_verbs_count, positive_op_count,
                 negative_op_count, wneg_count, wpos_count, wneu_count,
                 sneg_count, spos_count,
                 sneu_count) = self.Functions.bias_lexicon_feats(article)
                seq = [
                    bias_count, assertives_count, factives_count,
                    hedges_count, implicatives_count, report_verbs_count,
                    positive_op_count, negative_op_count, wneg_count,
                    wpos_count, wneu_count, sneg_count, spos_count,
                    sneu_count, NB_pobj, NB_psubj
                ]
            vects.append(seq)
        # The width follows the last row, as before; for an empty X there is
        # no row, so fall back to the declared feature count instead of
        # failing with a NameError.
        width = len(vects[-1]) if vects else self.num_features
        return np.array(vects).reshape(len(X), width)

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``X`` and ``y`` are accepted and ignored so the object can be used
        where scikit-learn calls ``fit(X, y)``.
        """
        return self

    def fit_transform(self, X, y=None):
        """Call ``fit`` and return ``transform(X)``; ``y`` is ignored."""
        self.fit()
        return self.transform(X)

    def get_feature_names(self):
        """Return ``self.feature_names`` (the comma-separated name string)."""
        return self.feature_names

    @staticmethod
    def make_str(seq):
        """Return a list with ``str()`` applied to every item of ``seq``."""
        return [str(s) for s in seq]
def __init__(self):
    """Create the feature-extraction helper and the feature metadata.

    Sets:
        self.Functions: a fresh ``Functions`` instance.
        self.seq: one comma-separated string naming the bias-lexicon and
            subjectivity features.
        self.num_features: number of names in ``self.seq`` (16).
        self.feature_names: the same string as ``self.seq``.
    """
    self.Functions = Functions()
    self.seq = (
        'bias_count, assertives_count, ' +
        'factives_count, hedges_count, implicatives_count, ' +
        'report_verbs_count, positive_op_count, negative_op_count, ' +
        'wneg_count, wpos_count, wneu_count, sneg_count, ' +
        'spos_count, sneu_count, NB_pobj, NB_psubj')
    # self.seq is a single string here, so count its comma-separated names.
    # Iterating it directly would walk individual characters and yield
    # (number of characters + number of commas) instead of 16.
    self.num_features = len(self.seq.split(','))
    self.feature_names = self.seq
class morality_vectorizer(TransformerMixin):
    """Transformer that maps article texts to moral-foundation features.

    Any custom transformer needs to inherit sklearn's TransformerMixin, or be
    a Python class that implements a ``fit`` method.

    Each article becomes one row holding the 11 values returned by
    ``Functions.moral_foundation_feats``.
    """

    def __init__(self):
        """Create the feature-extraction helper and the feature metadata."""
        self.Functions = Functions()
        self.seq = (
            'HarmVirtue, HarmVice, ' +
            'FairnessVirtue, FairnessVice, IngroupVirtue, IngroupVice, ' +
            'AuthorityVirtue, AuthorityVice, PurityVirtue, PurityVice, ' +
            'MoralityGeneral')
        # self.seq is a single string, so count its comma-separated names.
        # Iterating it directly would walk characters and inflate the count,
        # which made the all-zero row for empty articles the wrong width.
        self.num_features = len(self.seq.split(','))
        self.feature_names = self.seq

    def transform(self, X):
        """Return a 2-D array with one feature row per article in ``X``.

        Args:
            X: sized iterable of article strings.

        Returns:
            ``np.ndarray`` reshaped to ``(len(X), row_width)``. When ``X`` is
            empty the result has shape ``(0, self.num_features)``.
        """
        vects = []
        for article in X:
            if not article.strip():
                # If text is not available, generate a set of zeros.
                # NOTE(review): the zeros are the string '0'; mixing them with
                # numeric rows makes numpy build a string array -- confirm
                # what the downstream consumer expects.
                seq = ['0'] * self.num_features
            else:
                (HarmVirtue, HarmVice, FairnessVirtue, FairnessVice,
                 IngroupVirtue, IngroupVice, AuthorityVirtue,
                 AuthorityVice, PurityVirtue, PurityVice,
                 MoralityGeneral) = self.Functions.moral_foundation_feats(article)
                seq = [
                    HarmVirtue, HarmVice, FairnessVirtue, FairnessVice,
                    IngroupVirtue, IngroupVice, AuthorityVirtue,
                    AuthorityVice, PurityVirtue, PurityVice, MoralityGeneral
                ]
            vects.append(seq)
        # The width follows the last row, as before; for an empty X there is
        # no row, so fall back to the declared feature count instead of
        # failing with a NameError.
        width = len(vects[-1]) if vects else self.num_features
        return np.array(vects).reshape(len(X), width)

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``X`` and ``y`` are accepted and ignored so the object can be used
        where scikit-learn calls ``fit(X, y)``.
        """
        return self

    def fit_transform(self, X, y=None):
        """Call ``fit`` and return ``transform(X)``; ``y`` is ignored."""
        self.fit()
        return self.transform(X)

    def get_feature_names(self):
        """Return ``self.feature_names`` (the comma-separated name string)."""
        return self.feature_names

    @staticmethod
    def make_str(seq):
        """Return a list with ``str()`` applied to every item of ``seq``."""
        return [str(s) for s in seq]
def __init__(self):
    """Load the LIWC dictionaries and assemble the feature-name metadata.

    Sets:
        self.Functions: a fresh ``Functions`` instance.
        self.cat_dict, self.stem_dict, self.counts_dict: the three values
            returned by ``Functions.load_LIWC_dictionaries`` for the
            'feature_extraction/resources/' directory.
        self.liwc_cats: the values of ``self.cat_dict`` in iteration order.
        self.pos_tags: the 36 part-of-speech tag names.
        self.seq: 3-tuple of comma-separated name strings (fixed features,
            POS tags, LIWC categories).
        self.num_features: total number of comma-separated names in
            ``self.seq``.
        self.feature_names: the same tuple as ``self.seq``.
    """
    self.Functions = Functions()

    dictionaries = self.Functions.load_LIWC_dictionaries(
        'feature_extraction/resources/')
    self.cat_dict, self.stem_dict, self.counts_dict = dictionaries

    categories = []
    for key in self.cat_dict:
        categories.append(self.cat_dict[key])
    self.liwc_cats = categories

    self.pos_tags = [
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS',
        'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP',
        'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'WP$',
        'WRB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP',
    ]

    # Adjacent literals concatenate to exactly the same text as before,
    # including the irregular spacing after some commas.
    fixed_names = (
        'Happiness, HarmVirtue, HarmVice, '
        'FairnessVirtue, FairnessVice, IngroupVirtue, IngroupVice, '
        'AuthorityVirtue, AuthorityVice, PurityVirtue, PurityVice, '
        'MoralityGeneral, bias_count, assertives_count, '
        'factives_count, hedges_count, implicatives_count, '
        'report_verbs_count, positive_op_count, negative_op_count, '
        'wneg_count, wpos_count, wneu_count, sneg_count, '
        'spos_count, sneu_count, TTR, vad_neg, vad_neu, vad_pos, FKE,'
        'SMOG, stop, wordlen,WC, NB_pobj, NB_psubj, quotes, Exclaim,'
        'AllPunc, allcaps'
    )
    pos_names = ','.join(self.pos_tags)
    liwc_names = ','.join(self.liwc_cats)
    self.seq = (fixed_names, pos_names, liwc_names)

    # Each tuple element is a comma-separated group; add up the group sizes.
    total = 0
    for group in self.seq:
        total += len(group.split(','))
    self.num_features = total
    self.feature_names = self.seq
def whatsbeendon(filename): pids = [] try: with open(filename) as data: pids = [line.strip().split(',')[0] for line in data] return set(pids) except: return set(pids) def make_str(seq): return [str(s) for s in seq] Functions = Functions() outfile = 'features-title.csv' if TEXT_TYPE == 'title' else 'features-body.csv' outpath = "./" done = whatsbeendon(outfile) articlesDir = 'articles/mbfc/' sources_dictionary = {} with open(articlesDir + 'mappings.txt') as f: for line in f.readlines(): array = line.strip().split('\t') sources_dictionary.update({array[0]: (array[1], array[2])}) cat_dict, stem_dict, counts_dict = Functions.load_LIWC_dictionaries() liwc_cats = [cat_dict[cat] for cat in cat_dict] pos_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
class nela_vectorizer(TransformerMixin):
    """Transformer that maps article texts to the full combined feature set.

    Any custom transformer needs to inherit sklearn's TransformerMixin, or be
    a Python class that implements a ``fit`` method.

    Each article becomes one row: 41 fixed features, followed by the list
    returned by ``Functions.POS_features`` and the list returned by
    ``Functions.LIWC``.
    """

    def __init__(self):
        """Load the LIWC dictionaries and assemble the feature-name metadata."""
        self.Functions = Functions()
        self.cat_dict, self.stem_dict, self.counts_dict = \
            self.Functions.load_LIWC_dictionaries('feature_extraction/resources/')
        self.liwc_cats = [self.cat_dict[cat] for cat in self.cat_dict]
        self.pos_tags = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
            'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
            'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'WP$', 'WRB', 'VB',
            'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP']
        # A 3-tuple of comma-separated name groups: fixed features, POS tags,
        # LIWC categories.
        self.seq = ('Happiness, HarmVirtue, HarmVice, ' +
                    'FairnessVirtue, FairnessVice, IngroupVirtue, IngroupVice, ' +
                    'AuthorityVirtue, AuthorityVice, PurityVirtue, PurityVice, ' +
                    'MoralityGeneral, bias_count, assertives_count, ' +
                    'factives_count, hedges_count, implicatives_count, ' +
                    'report_verbs_count, positive_op_count, negative_op_count, ' +
                    'wneg_count, wpos_count, wneu_count, sneg_count, ' +
                    'spos_count, sneu_count, TTR, vad_neg, vad_neu, vad_pos, FKE,' +
                    'SMOG, stop, wordlen,WC, NB_pobj, NB_psubj, quotes, Exclaim,' +
                    'AllPunc, allcaps',
                    ','.join(self.pos_tags),
                    ','.join(self.liwc_cats))
        self.num_features = sum(len(s.split(',')) for s in self.seq)
        self.feature_names = self.seq

    def transform(self, X):
        """Return a 2-D array with one feature row per article in ``X``.

        Args:
            X: sized iterable of article strings.

        Returns:
            ``np.ndarray`` reshaped to ``(len(X), row_width)``. When ``X`` is
            empty the result has shape ``(0, self.num_features)``.
        """
        vects = []
        for article in X:
            if not article.strip():
                # If text is not available, generate a set of zeros.
                # NOTE(review): the zeros are the string '0'; mixing them with
                # numeric rows makes numpy build a string array -- confirm
                # what the downstream consumer expects.
                seq = ['0'] * self.num_features
            else:
                pos_features_path = 'feature_extraction/temp/'
                quotes, Exclaim, AllPunc, allcaps = \
                    self.Functions.stuff_LIWC_leftout('ERROR', article)
                lex_div = float(self.Functions.ttr(article))
                counts_norm = self.Functions.POS_features(
                    'input', article, pos_features_path)
                # The second return value (category names) is not needed here.
                counts_norm_liwc, _ = self.Functions.LIWC(
                    article, self.cat_dict, self.stem_dict, self.counts_dict)
                vadneg, vadneu, vadpos = self.Functions.vadersent(article)
                fke, SMOG = self.Functions.readability(article)
                stop, wordlen, WC = self.Functions.wordlen_and_stop(article)
                NB_pobj, NB_psubj = self.Functions.subjectivity(article)
                (bias_count, assertives_count, factives_count, hedges_count,
                 implicatives_count, report_verbs_count, positive_op_count,
                 negative_op_count, wneg_count, wpos_count, wneu_count,
                 sneg_count, spos_count,
                 sneu_count) = self.Functions.bias_lexicon_feats(article)
                (HarmVirtue, HarmVice, FairnessVirtue, FairnessVice,
                 IngroupVirtue, IngroupVice, AuthorityVirtue,
                 AuthorityVice, PurityVirtue, PurityVice,
                 MoralityGeneral) = self.Functions.moral_foundation_feats(article)
                happiness = float(self.Functions.happiness_index_feats(article))
                # Same order as the names in self.seq: fixed features first,
                # then the POS counts, then the LIWC counts.
                seq = [
                    happiness, HarmVirtue, HarmVice, FairnessVirtue,
                    FairnessVice, IngroupVirtue, IngroupVice, AuthorityVirtue,
                    AuthorityVice, PurityVirtue, PurityVice, MoralityGeneral,
                    bias_count, assertives_count, factives_count,
                    hedges_count, implicatives_count, report_verbs_count,
                    positive_op_count, negative_op_count, wneg_count,
                    wpos_count, wneu_count, sneg_count, spos_count,
                    sneu_count, lex_div, vadneg, vadneu, vadpos, fke, SMOG,
                    stop, wordlen, WC, NB_pobj, NB_psubj, quotes, Exclaim,
                    AllPunc, allcaps
                ] + counts_norm + counts_norm_liwc
            vects.append(seq)
        # The width follows the last row, as before; for an empty X there is
        # no row, so fall back to the declared feature count instead of
        # failing with a NameError.
        width = len(vects[-1]) if vects else self.num_features
        return np.array(vects).reshape(len(X), width)

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``X`` and ``y`` are accepted and ignored so the object can be used
        where scikit-learn calls ``fit(X, y)``.
        """
        return self

    def fit_transform(self, X, y=None):
        """Call ``fit`` and return ``transform(X)``; ``y`` is ignored."""
        self.fit()
        return self.transform(X)

    def get_feature_names(self):
        """Return ``self.feature_names`` (the tuple of name groups)."""
        return self.feature_names

    @staticmethod
    def make_str(seq):
        """Return a list with ``str()`` applied to every item of ``seq``."""
        return [str(s) for s in seq]