Example #1
def preprocess(sentences):
    # Tokenize sentences
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    model = AutoModel.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")

    encoded_input = tokenizer(sentences.to_list(), padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, mean pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    sentiment_train = sentences.apply(lambda x: sentiment(x))
    sentiment_train = pd.DataFrame(sentiment_train.values.tolist(),
                                   columns=['polarity', 'subjectivity'],
                                   index=sentences.index)
    parse_s = sentences.apply(lambda x: parse(x, lemmata=True))
    sent = parse_s.apply(lambda x: Sentence(x))
    modality_s = pd.DataFrame(sent.apply(lambda x: modality(x)))

    meta_df = sentiment_train.merge(modality_s, left_index=True, right_index=True)
    input_matrix = pd.concat([meta_df.reset_index(drop=True), pd.DataFrame(sentence_embeddings)], axis=1)

    return input_matrix
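The preprocess() function above relies on a mean_pooling helper that is not shown in the snippet. A minimal sketch, assuming it follows the usual sentence-transformers mean-pooling recipe (averaging token embeddings weighted by the attention mask):

import torch

def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, ignoring padded positions via the attention mask
    token_embeddings = model_output[0]  # last hidden state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)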
Example #2
def get_word2vec_vocab():
    """
    Gets the vocabulary of the pretrained word2vec model and writes the proper-noun
    (NNP) entries to a new file, one word per line.
    :return: Nothing
    """
    print("Extracting word2vec vocabulary")
    model = KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300.bin", binary=True)
    vocab = model.wv.vocab
    w = open("extracted/lists/GoogleNews-vectors-negative300", "w+")
    with tqdm(total=len(vocab)) as pbar:
        for v in vocab:
            if len(parse(v + "\n")) > 0:
                if parse(v + "\n").split("/")[1] == "NNP" and " ".join(
                        v.lower().split("_")).title() == " ".join(
                            v.split("_")):
                    w.write(v + "\n")
            pbar.update(1)
    w.close()
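The NNP check above works because pattern.en's parse() returns a slash-delimited tagged string in the form word/part-of-speech/chunk/preposition, so splitting on "/" puts the POS tag at index 1. A small illustration (the exact chunk tags are approximate):

from pattern.en import parse

print(parse("Germany"))                # e.g. 'Germany/NNP/B-NP/O'
print(parse("Germany").split("/")[1])  # 'NNP', the part-of-speech tag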
Example #3
def get_word2vec_vocab():
    model = KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300.bin", binary=True)
    vocab = model.wv.vocab
    w = open("extracted/lists/GoogleNews-vectors-negative300_NNP", "w+")
    with tqdm(total=len(vocab)) as pbar:
        for v in vocab:
            # .split("_") returns a list, which has no .capitalize(); title-case the
            # joined string instead (as in the previous example) to keep proper nouns
            if parse(v + "\n").split("/")[1] == "NNP" and " ".join(
                    v.lower().split("_")).title() == " ".join(
                        v.split("_")):
                w.write(v + "\n")
            pbar.update(1)
    w.close()
Example #4
def lemmatize_sentence(content,
                       allowed_tags=nltk.re.compile(''),
                       stopwords=frozenset(),
                       min_length=1,
                       max_length=100):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
    their base form=lemma, e.g. "are, is, being" -> "be" etc.
    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']

    """

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = gensim.utils.u(' ').join(
        gensim.utils.tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith(
                    '_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    result.append((token, tag, lemma))
    return result
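Note that the default allowed_tags regex above matches every tag, even though the docstring (inherited from gensim's lemmatize utility) describes noun/verb/adjective/adverb filtering; to get that behaviour a restrictive pattern has to be passed explicitly. A hedged usage sketch (the exact tags depend on the pattern tagger):

import re

# Restrict the result to nouns, verbs, adjectives and adverbs; the function
# returns (token, tag, lemma) tuples rather than 'lemma/tag' strings.
tokens = lemmatize_sentence("The study ranks high.",
                            allowed_tags=re.compile('(NN|VB|JJ|RB)'))
# Roughly: [('study', 'NN', 'study'), ('ranks', 'VBZ', 'rank'), ('high', 'JJ', 'high')]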
Example #5
def lemmatize_sentence(content, allowed_tags=nltk.re.compile(''),
                       stopwords=frozenset(), min_length=1, max_length=100):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
    their base form=lemma, e.g. "are, is, being" -> "be" etc.
    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']

    """

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = gensim.utils.u(' ').join(
        gensim.utils.tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith(
                    '_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    result.append((token, tag, lemma))
    return result
Example #6
def collect_sentence_data(sent):
    #######################################################################################################
    # was going to use this NER code to detect gendered referents but the false positive rate was insane! #
    # for instance, it inferred all instances of 'Cars' as well as 'Great Britain' to be people!?!?!?!?!?! #
    #######################################################################################################
    #people = []
    #for chunk in nltk.ne_chunk(sent.tags,binary=False):
    #    if hasattr(chunk, '_label'):
    #        person = str()
    #        if chunk._label == 'PERSON':
    #            print("found person in '" + str(chunk))
    #            for word in chunk:
    #                person += ' ' + word[0].lower()

    p = parse(sent.string)
    inNP = False
    words = p.split()
    # List of noun phrases. Each phrase is a tuple of a 0/1 flag (1 if the phrase is
    # inside a prepositional phrase) and a noun list, where each noun is a
    # (word, 'Sing'/'Plur', index) tuple.
    nphrase_list = []
    noun_list = []
    nphrase = ()
    length = len(words[0])
    i = 0
    for word in words[0]:
        if 'NP' in word[2]:
            if not inNP:
                #this indicates we're in the first word in a noun phrase
                inNP = True
                if 'PNP' in word[3]:
                    #this indicates we're in a prep phrase
                    nphrase += 1,
                else:
                    nphrase += 0,
            if 'NN' in word[1]:
                #current word is noun
                if 'S' in word[1]:
                    noun = (word[0], 'Plur', i)
                    noun_list.append(noun)
                else:
                    noun = (word[0], 'Sing', i)
                    noun_list.append(noun)
            elif 'PRP' in word[1]:
                #current word is pronoun
                if 'S' in word[1]:
                    noun = (word[0], 'Plur', i)
                    noun_list.append(noun)
                else:
                    noun = (word[0], 'Sing', i)
                    noun_list.append(noun)
            if i == length - 1:
                nphrase += noun_list,
                nphrase_list.append(nphrase)
                inNP = False
                nphrase = ()
                noun_list = []
        elif inNP:
            nphrase += noun_list,
            nphrase_list.append(nphrase)
            inNP = False
            nphrase = ()
            noun_list = []
        i += 1
    return nphrase_list
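To make the returned structure concrete, a hypothetical call (assuming sent is a pattern.en Sentence and parse is imported from pattern.en, as elsewhere on this page) might look like this; the exact indices and flags depend on the tagger:

from pattern.en import parsetree

sent = parsetree("The cat sat on the mats.")[0]
print(collect_sentence_data(sent))
# e.g. [(0, [('cat', 'Sing', 1)]), (1, [('mats', 'Plur', 5)])]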
Example #7
            for v in vocab:
                w.write(v + "\n")
                pbar.update(1)
        w.close()

        w = open("extracted/lists/vocab_word2vec_POS.txt", "w+")
        with tqdm(total=len(vocab)) as pbar:
            for v in vocab:
                w.write(tag(v) + "\n")
                pbar.update(1)
        w.close()

        w = open("extracted/lists/" + checkpoint + "_parse.txt", "w+")
        with tqdm(total=len(vocab)) as pbar:
            for v in vocab:
                w.write(parse(str(v) + "\n") + "\n")
                pbar.update(1)
        w.close()

        # These names are in game_names, obtained in scrape.py from taking words before the pattern "is a * game"
        # The list is small and noisy. We will take a single popular game,
        # top100 = my_model.model.most_similar('Borderlands', topn=100)
        tops_adj = {
            'Borderlands': None,
            'facets': None,
            'adjective': None,
            'type': None,
            'multiplayer': None,
            'characteristics': None,
            'gameplay': None,  # Good example for 3- find adjectives (or other information) characteristic of a facet of a game
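The commented-out most_similar call is the standard gensim KeyedVectors API; a hedged illustration of that lookup, reusing the model path from the earlier examples on this page and assuming the token exists in the vocabulary:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format(
    "data/GoogleNews-vectors-negative300.bin", binary=True)
# Returns a list of (word, cosine_similarity) pairs, most similar first
top100 = kv.most_similar('Borderlands', topn=100)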
Example #8
File: bias.py Project: yaoyang33/bsd
def extract_bias_features(text, do_get_caster=False):
    features = OrderedDict()
    if sys.version_info < (3, 0):
        # ignore conversion errors between utf-8 and ascii
        text = text.decode('ascii', 'ignore')
    text_nohyph = text.replace(
        "-", " ")  # preserve hyphenated words as separate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    ## SENTENCE LEVEL MEASURES
    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # compound sentiment score using VADER sentiment analysis package
    vader_sentiment = vader_sentiment_analysis.polarity_scores(text)
    vader_negative_proportion = vader_sentiment['neg']
    vader_compound_sentiment = vader_sentiment['compound']
    features['vader_sentiment'] = vader_compound_sentiment
    features['vader_senti_abs'] = abs(vader_compound_sentiment)

    # negative-perspective
    features['neg_persp'] = check_neg_persp(words, vader_negative_proportion,
                                            vader_compound_sentiment)

    # modality (certainty) score and mood using  http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['certainty'] = round(modality(sentence_obj), 4)

    # quoted material
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["quote_length"] = quote_dict["mean_quote_length"]
    features["nonquote_length"] = quote_dict["mean_nonquote_length"]

    ## LEXICON LEVEL MEASURES
    # presupposition markers
    count = count_feature_freq(presup, words, txt_lwr)
    features['presup_cnt'] = count
    features['presup_rto'] = round(old_div(float(count), float(len(words))), 4)

    # doubt markers
    count = count_feature_freq(doubt, words, txt_lwr)
    features['doubt_cnt'] = count
    features['doubt_rto'] = round(old_div(float(count), float(len(words))), 4)

    # partisan words and phrases
    count = count_feature_freq(partisan, words, txt_lwr)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(old_div(float(count), float(len(words))),
                                     4)

    # subjective value laden word count
    count = count_feature_freq(value_laden, words, txt_lwr)
    features['value_cnt'] = count
    features['value_rto'] = round(old_div(float(count), float(len(words))), 4)

    # figurative language markers
    count = count_feature_freq(figurative, words, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # attribution markers
    count = count_feature_freq(attribution, words, txt_lwr)
    features['attribution_cnt'] = count
    features['attribution_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # self reference pronouns
    count = count_feature_freq(self_refer, words, txt_lwr)
    features['self_refer_cnt'] = count
    features['self_refer_rto'] = round(
        old_div(float(count), float(len(words))), 4)

    # Contextual Aspect Summary and Topical-Entity Recognition (CASTER)
    if do_get_caster:
        """ May incur a performance cost in time to process """
        caster_dict = get_caster(text)
        features['caster_dict'] = caster_dict

    return features
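find_ngrams() is called here (and in the later bias.py variants) but not shown. A minimal sketch of what such a helper typically looks like; this is an assumption, not necessarily the project's own implementation:

def find_ngrams(tokens, n):
    # Slide an n-wide window over the token list and return the n-grams as tuples
    return list(zip(*[tokens[i:] for i in range(n)]))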
Example #9
    stemmed_question2 = stemmed_questions_pairs[i][1]

    question1.set_lemmatized_question_tokens(lemmatized_question1)
    question2.set_lemmatized_question_tokens(lemmatized_question2)

    question1.set_stemmed_question_tokens(stemmed_question1)
    question2.set_stemmed_question_tokens(stemmed_question2)

    question1.set_question_tokens(tokenized_question1)
    question2.set_question_tokens(tokenized_question2)

    question1_parsetree = parsetree(question1_string, relations=True)
    question2_parsetree = parsetree(question2_string, relations=True)

    if len(question1_parsetree) == 1:
        parse_string = str(parse(question1_string, relations=True))
        question1.set_parse_tree_string(parse_string)
        tokens_tags, tokens_roles, tokens_chunks = {}, {}, {}

        for sentence in question1_parsetree:
            for chunk in sentence.chunks:
                for word in chunk.words:
                    #print(chunk.role)
                    tokens_tags[(int(word.index), str(word.string))] = word.tag
                    tokens_chunks[(int(word.index),
                                   str(word.string))] = word.chunk
                    tokens_roles[(int(word.index),
                                  str(word.string))] = chunk.role

        question1.set_tokens_tags(tokens_tags)
        question1.set_tokens_chunks(tokens_chunks)
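For context, relations=True asks pattern.en to label grammatical relations (subject, object, ...), which is what chunk.role exposes above. A small illustration; the output shown is approximate:

from pattern.en import parse

# The relation column (e.g. NP-SBJ-1, VP-1, NP-OBJ-1) is appended after the PNP column.
print(parse("I eat pizza.", relations=True))
# roughly: 'I/PRP/B-NP/O/NP-SBJ-1 eat/VBP/B-VP/O/VP-1 pizza/NN/B-NP/O/NP-OBJ-1 ././O/O/O'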
Example #10
File: bias.py Project: photi/bsd
def extract_bias_features(text):
    features = {}
    text = unicode(text,
                   errors='ignore') if not isinstance(text, unicode) else text
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)

    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)

    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)

    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)

    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using  http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    features['mood'] = mood(sentence_obj)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)

    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)

    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)

    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)

    # liwc discrepancy word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    return features
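The count/ratio pattern above repeats for every lexicon; a small hypothetical helper (not part of the original project) shows the shared shape of those feature pairs:

def add_count_ratio(features, key, count, n_words):
    # Store the raw count and the count normalised by document length
    features[key + '_cnt'] = count
    features[key + '_rto'] = round(float(count) / float(n_words), 4)

# e.g. add_count_ratio(features, 'hedge',
#                      count_feature_list_freq(hedges, words, bigrams, trigrams),
#                      len(words))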
Example #11
def collect_sentence_data(sent):
	#######################################################################################################
	# was going to use this NER code to detect gendered referents but the false positive rate was insane! #
	# for instance, it inferred all instances of 'Cars' as well as 'Great Britain' to be people!?!?!?!?!?! #
	#######################################################################################################
	#people = []
	#for chunk in nltk.ne_chunk(sent.tags,binary=False):                  
	#    if hasattr(chunk, '_label'):
	#        person = str()
	#        if chunk._label == 'PERSON':
	#            print("found person in '" + str(chunk))
	#            for word in chunk:
	#                person += ' ' + word[0].lower()

	p = parse(sent.string)
	inNP = False
	words = p.split()
	# List of noun phrases. Each phrase is a tuple of a 0/1 flag (1 if the phrase is
	# inside a prepositional phrase) and a noun list, where each noun is a
	# (word, 'Sing'/'Plur', index) tuple.
	nphrase_list = []
	noun_list = []
	nphrase = ()
	length =  len(words[0])
	i = 0
	for word in words[0]:
		if 'NP' in word[2]:                        
			if not inNP:
				#this indicates we're in the first word in a noun phrase
				inNP = True
				if 'PNP' in word[3]:
					#this indicates we're in a prep phrase
					nphrase += 1,
				else:
					nphrase += 0,
			if 'NN' in word[1]:
				#current word is noun
				if 'S' in word[1]:
					noun = (word[0], 'Plur', i)
					noun_list.append(noun)
				else:
					noun = (word[0], 'Sing', i)
					noun_list.append(noun)
			elif 'PRP' in word[1]:
				#current word is pronoun
				if 'S' in word[1]:    
					noun = (word[0], 'Plur' , i)
					noun_list.append(noun)
				else:
					noun = (word[0], 'Sing' , i)
					noun_list.append(noun)
			if i == length - 1:
				nphrase += noun_list,
				nphrase_list.append(nphrase)
				inNP = False
				nphrase = ()
				noun_list = []
		elif inNP:
			nphrase += noun_list,
			nphrase_list.append(nphrase)
			inNP = False
			nphrase = ()
			noun_list = []
		i+=1                        
	return nphrase_list
Example #12
File: bias.py Project: connieimdialog/bsd
def extract_bias_features(text):
    features = OrderedDict()
    text_nohyph = text.replace(
        "-", " ")  # preserve hyphenated words as seperate tokens
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # presupposition verb count
    count = count_feature_list_freq(presup, words, bigrams, trigrams)
    features['presup_cnt'] = count
    features['presup_rto'] = round(float(count) / float(len(words)), 4)

    # coherence marker count
    count = count_phrase_freq(coherence, txt_lwr)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # partisan words and phrases count
    count = count_feature_list_freq(partisan, words, bigrams, trigrams)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(float(count) / float(len(words)), 4)

    # subjective value laden word count
    count = count_feature_list_freq(value_laden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment
    features['vader_senti_abs'] = abs(compound_sentiment)

    # modality (certainty) score and mood using  http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)

    # figurative count
    count = count_phrase_freq(figurative, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(float(count) / float(len(words)), 4)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc self reference pronoun count
    count = count_liwc_list_freq(liwc_self, words)
    features['liwc_self_cnt'] = count
    features['liwc_self_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # handle quoted material in text
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["mean_quote_length"] = quote_dict["mean_quote_length"]
    features["mean_nonquote_length"] = quote_dict["mean_nonquote_length"]
    return features
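count_liwc_list_freq() is not shown in any of the bias.py excerpts. A hedged sketch of what such a helper could do, assuming the usual LIWC dictionary format in which entries may end in a '*' prefix wildcard (this is an assumption about the project's behaviour, not its actual code):

def count_liwc_list_freq(liwc_list, words):
    # Count tokens matching a LIWC category; entries ending in '*' match as prefixes
    count = 0
    for w in words:
        for entry in liwc_list:
            if entry.endswith('*'):
                if w.startswith(entry[:-1]):
                    count += 1
            elif w == entry:
                count += 1
    return count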
Example #13
def get_modality_mood(text):
    t = parse(text, lemmata=True)
    t = Sentence(t)
    return modality(t), mood(t)
Example #14
def get_modality_by_line(poem):
    return [round(modality(Sentence(parse(line, lemmata=True))), 1) for line in poem]
Example #15
def get_mood_by_line(poem):
    return [mood(Sentence(parse(line, lemmata=True))) for line in poem]
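Examples #13 to #15 all wrap the same pattern.en pipeline: parse with lemmata, wrap the tagged string in a Sentence, then score it. A hedged usage sketch; the numeric outputs are illustrative, not exact:

from pattern.en import parse, Sentence, modality, mood

poem = ["I could be wrong about this.", "It is certainly true."]
print(get_modality_by_line(poem))  # e.g. [0.2, 0.9] -- certainty in [-1, 1]
print(get_mood_by_line(poem))      # e.g. ['conditional', 'indicative']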