Example #1
import pandas as pd
import torch
from pattern.en import Sentence, modality, parse, sentiment
from transformers import AutoModel, AutoTokenizer


def preprocess(sentences):
    # Tokenize sentences
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    model = AutoModel.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")

    encoded_input = tokenizer(sentences.to_list(), padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, mean pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    sentiment_train = sentences.apply(lambda x: sentiment(x))
    sentiment_train = pd.DataFrame(sentiment_train.values.tolist(),
                                   columns=['polarity', 'subjectivity'],
                                   index=sentences.index)
    parse_s = sentences.apply(lambda x: parse(x, lemmata=True))
    sent = parse_s.apply(lambda x: Sentence(x))
    modality_s = pd.DataFrame(sent.apply(lambda x: modality(x)))

    meta_df = sentiment_train.merge(modality_s, left_index=True, right_index=True)
    # convert the embedding tensor to NumPy before handing it to pandas
    input_matrix = pd.concat([meta_df.reset_index(drop=True), pd.DataFrame(sentence_embeddings.numpy())], axis=1)

    return input_matrix
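
Example #1 also relies on a mean_pooling helper that is not shown. A minimal sketch, following the standard mean-pooling recipe from the sentence-transformers model cards (the helper definition itself is an assumption here):

import torch

def mean_pooling(model_output, attention_mask):
    # first element of model_output holds the token-level embeddings
    token_embeddings = model_output[0]
    # expand the attention mask so padding tokens contribute zero
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # sum the unmasked embeddings, then divide by the count of real tokens
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)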
Example #2
def extract_bias_features(text, do_get_caster=False):
    features = OrderedDict()
    if sys.version_info < (3, 0):
        # ignore conversion errors between utf-8 and ascii
        text = text.decode('ascii', 'ignore')
    text_nohyph = text.replace(
        "-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    ## SENTENCE LEVEL MEASURES
    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # compound sentiment score using VADER sentiment analysis package
    vader_sentiment = vader_sentiment_analysis.polarity_scores(text)
    vader_negative_proportion = vader_sentiment['neg']
    vader_compound_sentiment = vader_sentiment['compound']
    features['vader_sentiment'] = vader_compound_sentiment
    features['vader_senti_abs'] = abs(vader_compound_sentiment)

    # negative-perspective
    features['neg_persp'] = check_neg_persp(words, vader_negative_proportion,
                                            vader_compound_sentiment)

    # modality (certainty) score using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['certainty'] = round(modality(sentence_obj), 4)

    # quoted material
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["quote_length"] = quote_dict["mean_quote_length"]
    features["nonquote_length"] = quote_dict["mean_nonquote_length"]

    ## LEXICON LEVEL MEASURES
    # presupposition markers
    count = count_feature_freq(presup, words, txt_lwr)
    features['presup_cnt'] = count
    features['presup_rto'] = round(old_div(float(count), float(len(words))), 4)

    # doubt markers
    count = count_feature_freq(doubt, words, txt_lwr)
    features['doubt_cnt'] = count
    features['doubt_rto'] = round(old_div(float(count), float(len(words))), 4)

    # partisan words and phrases
    count = count_feature_freq(partisan, words, txt_lwr)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(old_div(float(count), float(len(words))),
                                     4)

    # subjective value laden word count
    count = count_feature_freq(value_laden, words, txt_lwr)
    features['value_cnt'] = count
    features['value_rto'] = round(old_div(float(count), float(len(words))), 4)

    # figurative language markers
    count = count_feature_freq(figurative, words, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # attribution markers
    count = count_feature_freq(attribution, words, txt_lwr)
    features['attribution_cnt'] = count
    features['attribution_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # self reference pronouns
    count = count_feature_freq(self_refer, words, txt_lwr)
    features['self_refer_cnt'] = count
    features['self_refer_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # Contextual Aspect Summary and Topical-Entity Recognition (CASTER)
    if do_get_caster:
        # may incur a noticeable processing-time cost
        caster_dict = get_caster(text)
        features['caster_dict'] = caster_dict

    return features
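
Examples #2 and #4 call a find_ngrams helper that is not shown. A one-line sketch using the common staggered-slice idiom (an assumption about how the project defines it):

def find_ngrams(tokens, n):
    # zip n staggered views of the token list to yield n-grams as tuples
    return list(zip(*[tokens[i:] for i in range(n)]))

For instance, find_ngrams(['a', 'b', 'c'], 2) yields [('a', 'b'), ('b', 'c')], matching the tuple unpacking in the bigram and trigram comprehensions above.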
Example #3
File: bias.py Project: photi/bsd
def extract_bias_features(text):
    features = {}
    # Python 2 idiom: coerce byte strings to unicode, ignoring decode errors
    text = unicode(text,
                   errors='ignore') if not isinstance(text, unicode) else text
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)

    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)

    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)

    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)

    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    features['mood'] = mood(sentence_obj)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)

    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)

    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)

    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)

    # liwc discrepancy word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    return features
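
count_feature_list_freq is project-internal and not shown. One plausible reconstruction, inferred from its call sites (a lexicon plus the unigram/bigram/trigram lists in, a single count out); the exact matching rules are an assumption:

def count_feature_list_freq(feature_list, words, bigrams, trigrams):
    # count tokens of any order that appear in the feature lexicon
    count = 0
    for token in words + bigrams + trigrams:
        if token in feature_list:
            count += 1
    return count

Note that words keeps repeats while the bigram and trigram lists are deduplicated upstream, so under this reading unigram matches are frequency-weighted and longer phrases count at most once each.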
Example #4
def extract_bias_features(text):
    features = OrderedDict()
    text_nohyph = text.replace(
        "-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # presupposition verb count
    count = count_feature_list_freq(presup, words, bigrams, trigrams)
    features['presup_cnt'] = count
    features['presup_rto'] = round(float(count) / float(len(words)), 4)

    # coherence marker count
    count = count_phrase_freq(coherence, txt_lwr)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # partisan words and phrases count
    count = count_feature_list_freq(partisan, words, bigrams, trigrams)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(float(count) / float(len(words)), 4)

    # subjective value laden word count
    count = count_feature_list_freq(value_laden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment
    features['vader_senti_abs'] = abs(compound_sentiment)

    # modality (certainty) score using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # figurative count
    count = count_phrase_freq(figurative, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(float(count) / float(len(words)), 4)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc self reference pronouns count
    count = count_liwc_list_freq(liwc_self, words)
    features['liwc_self_cnt'] = count
    features['liwc_self_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # handle quoted material in text
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["mean_quote_length"] = quote_dict["mean_quote_length"]
    features["mean_nonquote_length"] = quote_dict["mean_nonquote_length"]
    return features
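
count_liwc_list_freq is likewise not shown. LIWC category lists conventionally mix exact words with '*'-suffixed prefix patterns, so a hedged sketch might look like this (the wildcard handling is an assumption):

def count_liwc_list_freq(liwc_list, words):
    # entries ending in '*' are prefix patterns; everything else matches exactly
    count = 0
    for word in words:
        for entry in liwc_list:
            if entry.endswith('*') and word.startswith(entry[:-1]):
                count += 1
            elif word == entry:
                count += 1
    return count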
Example #5
from pattern.en import Sentence, modality, mood, parse


def get_modality_mood(text):
    # parse with lemmas, wrap in a Sentence, then score certainty and mood
    t = parse(text, lemmata=True)
    t = Sentence(t)
    return modality(t), mood(t)
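
A usage sketch (the output values are illustrative, not actual results):

print(get_modality_mood("It might rain tomorrow."))
# e.g. (0.2, 'conditional'): modality in [-1, 1], mood as a string label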
Example #6
from pattern.en import Sentence, modality, parse


def get_modality_by_line(poem):
    # score each line's certainty separately, rounded to one decimal place
    return [round(modality(Sentence(parse(line, lemmata=True))), 1) for line in poem]
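
Applied to a poem given as a list of lines (values illustrative):

poem = ["I might wander if I could,", "I will return before the night."]
print(get_modality_by_line(poem))  # e.g. [0.2, 0.8]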