def preprocess(sentences):
    # Tokenize sentences
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    model = AutoModel.from_pretrained("sentence-transformers/ce-ms-marco-TinyBERT-L-4")
    encoded_input = tokenizer(sentences.to_list(), padding=True, truncation=True,
                              max_length=128, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Per-sentence (polarity, subjectivity) and modality features
    sentiment_train = sentences.apply(lambda x: sentiment(x))
    sentiment_train = pd.DataFrame(sentiment_train.values.tolist(),
                                   columns=['polarity', 'subjectivity'],
                                   index=sentences.index)
    parse_s = sentences.apply(lambda x: parse(x, lemmata=True))
    sent = parse_s.apply(lambda x: Sentence(x))
    modality_s = pd.DataFrame(sent.apply(lambda x: modality(x)))
    meta_df = sentiment_train.merge(modality_s, left_index=True, right_index=True)
    input_matrix = pd.concat([meta_df.reset_index(drop=True),
                              pd.DataFrame(sentence_embeddings)], axis=1)
    return input_matrix
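# `mean_pooling` and `sentiment` are assumed to be defined/imported elsewhere:
# `sentiment` is presumably pattern.en's sentiment(), which returns a
# (polarity, subjectivity) tuple. A minimal sketch of mean_pooling, following
# the usual sentence-transformers recipe of attention-mask-weighted averaging
# over token embeddings (names here are assumptions, not the original code):
import torch

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element holds all token embeddings
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)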
def get_word2vec_vocab():
    """
    Gets the vocabulary from the pretrained word2vec model and writes it to a
    new file, one word per line. (The GoogleNews model contains about 3 million
    words and phrases.)
    :return: Nothing
    """
    print("Extracting word2vec vocabulary")
    model = KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300.bin", binary=True)
    vocab = model.wv.vocab
    w = open("extracted/lists/GoogleNews-vectors-negative300", "w+")
    with tqdm(total=len(vocab)) as pbar:
        for v in vocab:
            if len(parse(v + "\n")) > 0:
                # keep only proper nouns that are already title-cased
                if parse(v + "\n").split("/")[1] == "NNP" and " ".join(
                        v.lower().split("_")).title() == " ".join(v.split("_")):
                    w.write(v + "\n")
            pbar.update(1)
    w.close()
def get_word2vec_vocab():
    model = KeyedVectors.load_word2vec_format(
        "data/GoogleNews-vectors-negative300.bin", binary=True)
    vocab = model.wv.vocab
    w = open("extracted/lists/GoogleNews-vectors-negative300_NNP", "w+")
    with tqdm(total=len(vocab)) as pbar:
        for v in vocab:
            # note: `split("_")` returns a list, which has no .capitalize();
            # title-case the joined string instead
            if parse(v + "\n").split("/")[1] == "NNP" and " ".join(
                    v.lower().split("_")).title() == " ".join(v.split("_")):
                w.write(v + "\n")
            pbar.update(1)
    w.close()
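# A quick illustration of the NNP/title-case filter above (hedged: exact tags
# depend on the pattern.en version):
#   parse("Great_Britain\n").split("/")[1]                -> 'NNP'
#   " ".join("Great_Britain".lower().split("_")).title()  -> 'Great Britain'
#   " ".join("Great_Britain".split("_"))                  -> 'Great Britain'
# i.e. an entry is kept only if it is tagged as a proper noun and is already
# title-cased in the vocabulary.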
def lemmatize_sentence(content, allowed_tags=nltk.re.compile(''),
                       stopwords=frozenset(), min_length=1, max_length=100):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
    their base form (lemma), e.g. "are, is, being" -> "be". This is a smarter
    version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (all other
    lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = gensim.utils.u(' ').join(
        gensim.utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length \
                    and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    result.append((token, tag, lemma))
    return result
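# A hedged usage sketch: restrict to content-word tags as the docstring
# describes (the expected output is indicative only; exact tags depend on
# the installed pattern.en version):
import re

content_tags = re.compile('NN|VB|JJ|RB')
print(lemmatize_sentence('The study ranks high.', allowed_tags=content_tags))
# e.g. [('study', 'NN', 'study'), ('ranks', 'VBZ', 'rank'), ('high', 'JJ', 'high')]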
def collect_sentence_data(sent):
    ###################################################################################################
    # was going to use this NER code to detect gendered referents but the false positive rate was     #
    # insane! for instance, it inferred all instances of 'Cars' as well as 'Great Britain' to be      #
    # people!                                                                                         #
    ###################################################################################################
    #people = []
    #for chunk in nltk.ne_chunk(sent.tags, binary=False):
    #    if hasattr(chunk, '_label'):
    #        person = str()
    #        if chunk._label == 'PERSON':
    #            print("found person in '" + str(chunk))
    #            for word in chunk:
    #                person += ' ' + word[0].lower()

    p = parse(sent.string)
    inNP = False
    words = p.split()
    # list of noun phrases: each phrase is a tuple of a 0 or 1 (is the phrase
    # inside a prepositional phrase?) followed by a noun list, and each noun is
    # a tuple of (noun, 'Sing'/'Plur', token index)
    nphrase_list = []
    noun_list = []
    nphrase = ()
    length = len(words[0])
    i = 0
    for word in words[0]:
        if 'NP' in word[2]:
            if not inNP:  # first word in a noun phrase
                inNP = True
                if 'PNP' in word[3]:  # phrase is inside a prepositional phrase
                    nphrase += 1,
                else:
                    nphrase += 0,
            if 'NN' in word[1]:  # current word is a noun
                if 'S' in word[1]:
                    noun_list.append((word[0], 'Plur', i))
                else:
                    noun_list.append((word[0], 'Sing', i))
            elif 'PRP' in word[1]:  # current word is a pronoun
                if 'S' in word[1]:
                    noun_list.append((word[0], 'Plur', i))
                else:
                    noun_list.append((word[0], 'Sing', i))
            if i == length - 1:  # sentence ends inside the noun phrase
                nphrase += noun_list,
                nphrase_list.append(nphrase)
                inNP = False
                nphrase = ()
                noun_list = []
        elif inNP:  # the noun phrase just ended
            nphrase += noun_list,
            nphrase_list.append(nphrase)
            inNP = False
            nphrase = ()
            noun_list = []
        i += 1
    return nphrase_list
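# A hedged usage sketch (assumes pattern.en's parse/Sentence; the output shape
# is [(in_prep_phrase, [(noun, 'Sing'|'Plur', index), ...]), ...], and the
# exact result depends on the tagger version):
from pattern.en import parse, Sentence

s = Sentence(parse("The cats on the mat saw a bird."))
print(collect_sentence_data(s))
# e.g. [(0, [('cats', 'Plur', 1)]), (1, [('mat', 'Sing', 4)]), (0, [('bird', 'Sing', 7)])]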
for v in vocab:
    w.write(v + "\n")
    pbar.update(1)
w.close()

w = open("extracted/lists/vocab_word2vec_POS.txt", "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        # tag() returns a list of (word, POS) tuples, so stringify before writing
        w.write(str(tag(v)) + "\n")
        pbar.update(1)
w.close()

w = open("extracted/lists/" + checkpoint + "_parse.txt", "w+")
with tqdm(total=len(vocab)) as pbar:
    for v in vocab:
        w.write(parse(str(v) + "\n") + "\n")
        pbar.update(1)
w.close()

# For reference, what the two dump formats above look like (pattern.en output;
# exact tags vary by version):
#   tag("Borderlands")    -> [('Borderlands', 'NNP')]
#   parse("Borderlands")  -> 'Borderlands/NNP/B-NP/O'

# These names are in game_names, obtained in scrape.py by taking the words
# before the pattern "is a * game". The list is small and noisy. We will take
# a single popular game, e.g.
# top100 = my_model.model.most_similar('Borderlands', topn=100)
tops_adj = {
    'Borderlands': None,
    'facets': None,
    'adjective': None,
    'type': None,
    'multiplayer': None,
    'characteristics': None,
    'gameplay': None,
    # Good example for 3 - find adjectives (or other information) that are
    # characteristics of a facet of a game
def extract_bias_features(text, do_get_caster=False):
    features = OrderedDict()
    if sys.version_info < (3, 0):
        # ignore conversion errors between utf-8 and ascii
        text = text.decode('ascii', 'ignore')
    # preserve hyphenated words as separate tokens
    text_nohyph = text.replace("-", " ")
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [" ".join([w1, w2, w3])
                for w1, w2, w3 in sorted(set(trigram_tokens))]

    ## SENTENCE LEVEL MEASURES
    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)
    # compound sentiment score using VADER sentiment analysis package
    vader_sentiment = vader_sentiment_analysis.polarity_scores(text)
    vader_negative_proportion = vader_sentiment['neg']
    vader_compound_sentiment = vader_sentiment['compound']
    features['vader_sentiment'] = vader_compound_sentiment
    features['vader_senti_abs'] = abs(vader_compound_sentiment)
    # negative-perspective
    features['neg_persp'] = check_neg_persp(words, vader_negative_proportion,
                                            vader_compound_sentiment)
    # modality (certainty) score and mood using
    # http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['certainty'] = round(modality(sentence_obj), 4)
    # quoted material
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["quote_length"] = quote_dict["mean_quote_length"]
    features["nonquote_length"] = quote_dict["mean_nonquote_length"]

    ## LEXICON LEVEL MEASURES
    # presupposition markers
    count = count_feature_freq(presup, words, txt_lwr)
    features['presup_cnt'] = count
    features['presup_rto'] = round(old_div(float(count), float(len(words))), 4)
    # doubt markers
    count = count_feature_freq(doubt, words, txt_lwr)
    features['doubt_cnt'] = count
    features['doubt_rto'] = round(old_div(float(count), float(len(words))), 4)
    # partisan words and phrases
    count = count_feature_freq(partisan, words, txt_lwr)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(old_div(float(count), float(len(words))), 4)
    # subjective value laden word count
    count = count_feature_freq(value_laden, words, txt_lwr)
    features['value_cnt'] = count
    features['value_rto'] = round(old_div(float(count), float(len(words))), 4)
    # figurative language markers
    count = count_feature_freq(figurative, words, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(old_div(float(count), float(len(words))), 4)
    # attribution markers
    count = count_feature_freq(attribution, words, txt_lwr)
    features['attribution_cnt'] = count
    features['attribution_rto'] = round(old_div(float(count), float(len(words))), 4)
    # self reference pronouns
    count = count_feature_freq(self_refer, words, txt_lwr)
    features['self_refer_cnt'] = count
    features['self_refer_rto'] = round(old_div(float(count), float(len(words))), 4)

    # Contextual Aspect Summary and Topical-Entity Recognition (CASTER)
    if do_get_caster:
        # may incur a performance cost in time to process
        caster_dict = get_caster(text)
        features['caster_dict'] = caster_dict

    return features
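# A hedged usage sketch (the lexicons and helpers referenced above -- presup,
# doubt, count_feature_freq, check_neg_persp, check_quotes, get_caster, etc. --
# are assumed to be defined at module level):
feats = extract_bias_features("Critics say the plan is a complete disaster.")
print(feats['word_cnt'], feats['vader_sentiment'], feats['certainty'])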
stemmed_question2 = stemmed_questions_pairs[i][1]

question1.set_lemmatized_question_tokens(lemmatized_question1)
question2.set_lemmatized_question_tokens(lemmatized_question2)
question1.set_stemmed_question_tokens(stemmed_question1)
question2.set_stemmed_question_tokens(stemmed_question2)
question1.set_question_tokens(tokenized_question1)
question2.set_question_tokens(tokenized_question2)

question1_parsetree = parsetree(question1_string, relations=True)
question2_parsetree = parsetree(question2_string, relations=True)

if len(question1_parsetree) == 1:
    parse_string = str(parse(question1_string, relations=True))
    question1.set_parse_tree_string(parse_string)
    tokens_tags, tokens_roles, tokens_chunks = {}, {}, {}
    for sentence in question1_parsetree:
        for chunk in sentence.chunks:
            for word in chunk.words:
                #print(chunk.role)
                tokens_tags[(int(word.index), str(word.string))] = word.tag
                tokens_chunks[(int(word.index), str(word.string))] = word.chunk
                tokens_roles[(int(word.index), str(word.string))] = chunk.role
    question1.set_tokens_tags(tokens_tags)
    question1.set_tokens_chunks(tokens_chunks)
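# For reference, a hedged illustration of the pattern.en parsetree attributes
# used above (exact chunking and roles vary by version):
from pattern.en import parsetree

t = parsetree("What is the capital of France?", relations=True)
for chunk in t[0].chunks:
    for word in chunk.words:
        print(word.index, word.string, word.tag, word.chunk, chunk.role)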
def extract_bias_features(text):
    features = {}
    text = unicode(text, errors='ignore') if not isinstance(text, unicode) else text
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [" ".join([w1, w2, w3])
                for w1, w2, w3 in sorted(set(trigram_tokens))]

    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)
    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)
    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)
    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)
    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)
    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)
    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)
    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)
    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)
    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)
    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_sentiment'] = compound_sentiment
    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)
    # modality (certainty) score and mood using
    # http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    features['mood'] = mood(sentence_obj)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)
    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)
    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)
    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)
    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)
    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)
    # liwc discrepancy word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)
    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)
    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)
    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)
    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)
    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)
    return features
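# The lexicon counters are assumed helpers. A minimal sketch of
# count_feature_list_freq under the obvious interpretation (count how many
# times lexicon entries appear among the text's unigrams/bigrams/trigrams):
def count_feature_list_freq(lexicon, words, bigrams, trigrams):
    count = 0
    for term in lexicon:
        count += words.count(term) + bigrams.count(term) + trigrams.count(term)
    return count

# and of count_liwc_list_freq, where LIWC entries ending in '*' are treated
# as prefixes (standard LIWC convention):
def count_liwc_list_freq(lexicon, words):
    count = 0
    for term in lexicon:
        for w in words:
            if w == term or (term.endswith('*') and w.startswith(term[:-1])):
                count += 1
    return count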
def extract_bias_features(text):
    features = OrderedDict()
    # preserve hyphenated words as separate tokens
    text_nohyph = text.replace("-", " ")
    txt_lwr = str(text_nohyph).lower()
    words = ''.join(ch for ch in txt_lwr
                    if ch not in '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~').split()
    unigrams = sorted(list(set(words)))
    bigram_tokens = find_ngrams(words, 2)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = find_ngrams(words, 3)
    trigrams = [" ".join([w1, w2, w3])
                for w1, w2, w3 in sorted(set(trigram_tokens))]

    # word count
    features['word_cnt'] = len(words)
    # unique word count
    features['unique_word_cnt'] = len(unigrams)
    # presupposition verb count
    count = count_feature_list_freq(presup, words, bigrams, trigrams)
    features['presup_cnt'] = count
    features['presup_rto'] = round(float(count) / float(len(words)), 4)
    # coherence marker count
    count = count_phrase_freq(coherence, txt_lwr)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)
    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)
    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)
    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)
    # partisan words and phrases count
    count = count_feature_list_freq(partisan, words, bigrams, trigrams)
    features['partisan_cnt'] = count
    features['partisan_rto'] = round(float(count) / float(len(words)), 4)
    # subjective value laden word count
    count = count_feature_list_freq(value_laden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)
    # compound sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_sentiment'] = compound_sentiment
    features['vader_senti_abs'] = abs(compound_sentiment)
    # modality (certainty) score and mood using
    # http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentence_obj = Sentence(sentence)
    features['modality'] = round(modality(sentence_obj), 4)
    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = flesch_kincaid_grade(text)
    # figurative count
    count = count_phrase_freq(figurative, txt_lwr)
    features['figurative_cnt'] = count
    features['figurative_rto'] = round(float(count) / float(len(words)), 4)
    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)
    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)
    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)
    # liwc self reference pronouns count
    count = count_liwc_list_freq(liwc_self, words)
    features['liwc_self_cnt'] = count
    features['liwc_self_rto'] = round(float(count) / float(len(words)), 4)
    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)
    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)
    # handle quoted material in text
    quote_dict = check_quotes(text)
    features["has_quotes"] = quote_dict["has_quotes"]
    features["mean_quote_length"] = quote_dict["mean_quote_length"]
    features["mean_nonquote_length"] = quote_dict["mean_nonquote_length"]
    return features
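# check_quotes is an assumed helper; a minimal sketch of the interface the
# quote features above rely on (a simple heuristic that splits on double
# quotes and reports mean segment lengths in words):
def check_quotes(text):
    parts = text.split('"')
    quotes = [p for idx, p in enumerate(parts) if idx % 2 == 1]
    nonquotes = [p for idx, p in enumerate(parts) if idx % 2 == 0 and p.strip()]

    def mean_len(segments):
        if not segments:
            return 0
        return round(sum(len(s.split()) for s in segments) / float(len(segments)), 4)

    return {"has_quotes": len(quotes) > 0,
            "mean_quote_length": mean_len(quotes),
            "mean_nonquote_length": mean_len(nonquotes)}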
def get_modality_mood(text):
    t = parse(text, lemmata=True)
    t = Sentence(t)
    return modality(t), mood(t)
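# Example (pattern.en: modality() returns a certainty score in [-1.0, +1.0],
# mood() returns 'indicative', 'imperative', 'conditional' or 'subjunctive'):
print(get_modality_mood("It might rain tomorrow."))
# the modal 'might' should pull the certainty score well below 1.0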
def get_modality_by_line(poem):
    return [round(modality(Sentence(parse(line, lemmata=True))), 1)
            for line in poem]
def get_mood_by_line(poem):
    return [mood(Sentence(parse(line, lemmata=True))) for line in poem]
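# Example usage on a short "poem" given as a list of lines (exact scores are
# version-dependent, shown here only to illustrate the output shape):
poem = ["I think that I shall never see", "A poem lovely as a tree."]
print(get_modality_by_line(poem))  # e.g. [0.8, 1.0]
print(get_mood_by_line(poem))      # e.g. ['indicative', 'indicative']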