Example #1
File: bias.py Project: photi/bsd
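The listing omits the module-level setup. A plausible reconstruction of the imports this function needs, assuming the usual import paths for each package (the lexicon lists such as coherence and hedges, and the counting helpers, are defined elsewhere in the project):

import sys

import nltk
import textstat
from pattern.en import parse, Sentence, modality, mood
from pattern.en import sentiment as pattern_sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# shared analyzer instance used by all three examples below
vader_sentiment_analysis = SentimentIntensityAnalyzer()
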
def extract_bias_features(text):
    features = {}
    if sys.version_info < (3, 0) and not isinstance(text, unicode):
        # Python 2: decode byte strings to unicode, dropping bad bytes
        text = unicode(text, errors='ignore')
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
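    # note: 'not in' below is a substring test against the punctuation
    # string, so it drops single punctuation tokens and the "'s" clitic
    # that the NLTK tokenizer emits for possessives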
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)

    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)

    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)

    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)

    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    features['mood'] = mood(sentence_obj)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)

    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)

    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)

    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)

    # liwc discrepancy word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    return features
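
The two counting helpers live elsewhere in the project. A minimal sketch of their likely behavior, inferred from the call sites; the matching rules here are assumptions, not the project's actual code. Note that every *_rto feature divides by len(words), so empty input would raise ZeroDivisionError.

def count_feature_list_freq(feature_list, words, bigrams, trigrams):
    # count unigram, bigram, and trigram tokens that appear in the lexicon
    feature_set = set(feature_list)
    return sum(tok in feature_set for tok in words + bigrams + trigrams)

def count_liwc_list_freq(liwc_list, words):
    # LIWC-style lexicons use a trailing '*' as a prefix wildcard
    count = 0
    for word in words:
        for entry in liwc_list:
            if word == entry or (entry.endswith('*')
                                 and word.startswith(entry[:-1])):
                count += 1
                break
    return count
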
Example #2
def extract_bias_features(text, do_get_caster=False):
    features = OrderedDict()
    if sys.version_info < (3, 0):
        # Python 2: drop bytes that are not valid ASCII instead of raising
        text = text.decode('ascii', 'ignore')
    text_nohyph = text.replace("-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    ## SENTENCE LEVEL MEASURES
    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # compound sentiment score using VADER sentiment analysis package
    vader_sentiment = vader_sentiment_analysis.polarity_scores(text)
    vader_negative_proportion = vader_sentiment['neg']
    vader_compound_sentiment = vader_sentiment['compound']
    features['vader_sentiment'] = vader_compound_sentiment
    features['vader_senti_abs'] = abs(vader_compound_sentiment)

    # negative-perspective
    features['neg_persp'] = check_neg_persp(words, vader_negative_proportion,
                                            vader_compound_sentiment)

    # modality (certainty) score using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['certainty'] = round(modality(sentence_obj), 4)

    # quoted material
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["quote_length"] = quote_dict["mean_quote_length"]
    features["nonquote_length"] = quote_dict["mean_nonquote_length"]

    ## LEXICON LEVEL MEASURES
    # presupposition markers
    count = count_feature_freq(presup, words, txt_lwr)
    features['presup_cnt'] = count
    features['presup_rto'] = round(old_div(float(count), float(len(words))), 4)

    # doubt markers
    count = count_feature_freq(doubt, words, txt_lwr)
    features['doubt_cnt'] = count
    features['doubt_rto'] = round(old_div(float(count), float(len(words))), 4)

    # partisan words and phrases
    count = count_feature_freq(partisan, words, txt_lwr)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(old_div(float(count), float(len(words))),
                                     4)

    # subjective value laden word count
    count = count_feature_freq(value_laden, words, txt_lwr)
    features['value_cnt'] = count
    features['value_rto'] = round(old_div(float(count), float(len(words))), 4)

    # figurative language markers
    count = count_feature_freq(figurative, words, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # attribution markers
    count = count_feature_freq(attribution, words, txt_lwr)
    features['attribution_cnt'] = count
    features['attribution_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # self reference pronouns
    count = count_feature_freq(self_refer, words, txt_lwr)
    features['self_refer_cnt'] = count
    features['self_refer_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # Contextual Aspect Summary and Topical-Entity Recognition (CASTER)
    if do_get_caster:
        """ May incur a performance cost in time to process """
        caster_dict = get_caster(text)
        features['caster_dict'] = caster_dict

    return features
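
Examples 2 and 3 swap NLTK's n-gram helpers for a local find_ngrams and call flesch_kincaid_grade directly (presumably imported via something like from textstat import flesch_kincaid_grade). old_div is the futurize compatibility shim (from past.utils import old_div) and reduces to plain division here, since both operands are already floats. check_quotes is project-specific; below is the standard zip-based n-gram idiom plus a rough sketch of the interface check_quotes must satisfy. The regex and the length definitions are guesses based on the keys the caller reads.

import re

def find_ngrams(tokens, n):
    # standard zip idiom: lazily yields n-token tuples
    return zip(*[tokens[i:] for i in range(n)])

def check_quotes(text):
    # hypothetical reconstruction: mean character lengths of quoted and
    # unquoted spans, keyed exactly as extract_bias_features expects
    def mean_len(spans):
        spans = [s for s in spans if s.strip()]
        if not spans:
            return 0.0
        return round(sum(len(s) for s in spans) / float(len(spans)), 4)

    quoted = re.findall(r'"([^"]*)"', text)
    unquoted = re.split(r'"[^"]*"', text)
    return {
        "has_quotes": len(quoted) > 0,
        "mean_quote_length": mean_len(quoted),
        "mean_nonquote_length": mean_len(unquoted),
    }
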
Example #3
def extract_bias_features(text):
    features = OrderedDict()
    text_nohyph = text.replace("-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # presupposition verb count
    count = count_feature_list_freq(presup, words, bigrams, trigrams)
    features['presup_cnt'] = count
    features['presup_rto'] = round(float(count) / float(len(words)), 4)

    # coherence marker count
    count = count_phrase_freq(coherence, txt_lwr)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # partisan words and phrases count
    count = count_feature_list_freq(partisan, words, bigrams, trigrams)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(float(count) / float(len(words)), 4)

    # subjective value laden word count
    count = count_feature_list_freq(value_laden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment
    features['vader_senti_abs'] = abs(compound_sentiment)

    # modality (certainty) score using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # figurative count
    count = count_phrase_freq(figurative, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(float(count) / float(len(words)), 4)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc self-reference pronoun count
    count = count_liwc_list_freq(liwc_self, words)
    features['liwc_self_cnt'] = count
    features['liwc_self_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # handle quoted material in text
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["mean_quote_length"] = quote_dict["mean_quote_length"]
    features["mean_nonquote_length"] = quote_dict["mean_nonquote_length"]
    return features
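
All three variants return a flat mapping of feature names to values, so downstream code can consume them interchangeably. A quick usage sketch; the sample sentence is invented for illustration:

if __name__ == "__main__":
    sample = ('Critics insist the hastily announced plan could '
              'arguably be the most divisive proposal yet.')
    for name, value in extract_bias_features(sample).items():
        print("{}: {}".format(name, value))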