def get_liwc_scores(wc, rows):
    """
    Score every text with LIWC and lay the scores out as aligned rows.

    Each text is scored exactly once (the scorer used to be called twice per
    text), and ``rows`` is consumed in a single pass, so one-shot iterators
    are handled as well. The leftover debugging output was removed.

    :param wc: object exposing ``score_text(text)`` that returns a mapping
        from category name to score
    :param rows: iterable of texts to score
    :return: tuple ``(all_scores, category_list)``; ``category_list`` is the
        sorted list of every category seen in any text and ``all_scores``
        holds one list per text with the score of each category in that
        order (``0.0`` when the text has no score for a category)
    """
    scored_texts = [wc.score_text(sent) for sent in rows]

    # The column layout is the union of the categories seen across all texts.
    categories = set()
    for liwc_scores in scored_texts:
        categories.update(liwc_scores.keys())
    category_list = sorted(categories)

    all_scores = [
        [liwc_scores.get(category, 0.0) for category in category_list]
        for liwc_scores in scored_texts
    ]
    return all_scores, category_list
def get_liwc_features(train_data, test_data):
    """
    Build LIWC score matrices for the training and the test phrases.

    Every phrase is scored with ``word_category_counter.score_text`` and
    turned into one row holding the score of each entry of
    ``liwc_categories`` (in that order), with 0 for a category the scores
    do not contain.

    NOTE: this function is currently not being used in this program.

    :param train_data: iterable of training phrases
    :param test_data: iterable of test phrases
    :return: tuple ``(train_matrix, test_matrix)`` of ``sparse.csr_matrix``
    """
    print("getting liwc features")

    def _score_rows(phrases):
        # One dense row per phrase, columns ordered like liwc_categories.
        rows = []
        for phrase in phrases:
            scores = word_category_counter.score_text(phrase)
            rows.append([scores.get(category, 0) for category in liwc_categories])
        return rows

    train_rows = _score_rows(train_data)
    test_rows = _score_rows(test_data)
    return sparse.csr_matrix(train_rows), sparse.csr_matrix(test_rows)
def populate_features_labels(annot_type, embed_dim, use_topic_only, use_accomodation_features=False):
    """
    Build the feature matrix, treatment vector and outcome vectors for the
    post triples stored under ``annot_type``.

    :param annot_type: key selecting which list of triples to process
    :param embed_dim: width of the per-topic slot in the feature matrix; also
        forwarded to ``load_document_proportions``
    :param use_topic_only: when true the features are just a one-hot topic
        indicator (``dim == num_topics``)
    :param use_accomodation_features: when true the ``'accomodation'``
        categories are appended to the sentiment categories
    :return: tuple ``(features, treatments, outcome_map)``; ``features`` is a
        ``(num_triples, dim)`` array, ``treatments`` a ``num_triples`` vector
        of 0/1 and ``outcome_map`` maps every key of the LIWC category
        groups to a ``num_triples`` vector of outcomes
    """
    wc.load_dictionary(wc.default_dictionary_filename())
    discussion_posts, triples = load_dicts()
    post_embeddings = load_document_proportions(embed_dim)
    # post_embeddings = load_embeddings()
    category_types = load_liwc_cat_groups()
    # Concatenation creates a new list, so the += below does not modify the
    # lists stored in category_types.
    sent_cats = category_types['possent'] + category_types['negsent']
    if use_accomodation_features:
        sent_cats += category_types['accomodation']
    num_topics = len(topic_indices.keys())
    num_triples = len(triples[annot_type])
    # Full layout: one embed_dim-wide slot per topic, then the sentiment scores.
    dim = embed_dim * num_topics + len(sent_cats)
    if use_topic_only:
        dim = num_topics
    features = np.zeros((num_triples, dim))
    outcome_map = {ct: np.zeros(num_triples) for ct in category_types}
    treatments = np.zeros(num_triples)
    for idx, triple in enumerate(triples[annot_type]):
        # A triple is read as (post 1, post 2, post 3, annotation value,
        # discussion id).
        p1 = triple[0]
        p2 = triple[1]
        p3 = triple[2]
        annot_val = triple[3]
        did = triple[4]
        # Annotation values above 1 count as treated.
        treatment = 1 if annot_val > 1 else 0
        topic = discussion_posts[did][p1]['topic']
        embed1 = post_embeddings[p1]
        embed2 = post_embeddings[p2]
        # NOTE(review): the stacked vector is written into an embed_dim-wide
        # slice below, which only fits if embed1 and embed2 together have
        # embed_dim entries — confirm against load_document_proportions.
        embed = np.hstack([embed1, embed2])
        p1_liwc = wc.score_text(discussion_posts[did][p1]['text'])
        p3_liwc = wc.score_text(discussion_posts[did][p3]['text'])
        p1_sent_vec = get_liwc_vector(p1_liwc, sent_cats)
        tidx = topic_indices[topic]
        if use_topic_only:
            features[idx][tidx] = 1
        else:
            # The embedding goes into the slot of this triple's topic; the
            # other topic slots stay zero.
            features[idx, tidx * embed_dim:(tidx + 1) * embed_dim] = embed
            # NOTE(review): nesting was reconstructed from a flattened
            # source; the sentiment tail is assumed to belong to this
            # branch, since dim has no room for it in topic-only mode.
            features[idx, dim - len(sent_cats):] = p1_sent_vec
        treatments[idx] = treatment
        # One outcome per category group, computed from the LIWC scores of
        # post 1 and post 3.
        for ct in category_types:
            outcome = compute_outcome(p1_liwc, p3_liwc, category_types[ct])
            outcome_map[ct][idx] = outcome
    return features, treatments, outcome_map
def get_liwc_features(text, selected_features=None, get_all=False):
    """
    Extract LIWC features from raw text using raw category counts.

    The text is tokenized with ``get_tokens_tags`` (normalization off) and
    re-joined with single spaces before it is scored.

    :param text: raw text to score
    :param selected_features: accepted but not used by this function
    :param get_all: when equal to ``True`` the complete LIWC score mapping is
        returned instead of the hand-picked features
    :return: the full LIWC scores, or a dict of ``liwc:*`` features; the
        ``liwc:positive`` / ``liwc:negative`` flags are only present when the
        positive and negative counts differ
    """
    tokens, tags = get_tokens_tags(text, should_normalize=False)
    sentences = []
    for sent in tokens:
        sentences.append(' '.join(sent))
    text = " ".join(sentences)

    scores = word_category_counter.score_text(text, raw_counts=True)
    # Equality with True (rather than truthiness) is what callers get today.
    if get_all == True:
        return scores

    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    features = {"liwc:neg_emotion": negative, "liwc:pos_emotion": positive}

    if positive > negative:
        features.update({"liwc:positive": 1, "liwc:negative": 0})
    elif positive < negative:
        features.update({"liwc:positive": 0, "liwc:negative": 1})

    copied_categories = (
        ("liwc:swear_words", "Swear Words"),
        ("liwc:anger", "Anger"),
        ("liwc:health", "Health"),
        ("liwc:money", "Money"),
        ("liwc:pos_feelings", "Positive feelings"),
        ("liwc:time", "Time"),
    )
    for feature_name, category in copied_categories:
        features[feature_name] = scores[category]
    return features
def get_liwc_features(words, binning):
    """
    Turn the LIWC scores of a token sequence into a feature dict.

    :param words: iterable of string tokens, joined with spaces for scoring
    :param binning: when truthy every score is passed through ``bin``,
        otherwise the raw score is used
    :return: dict mapping each LIWC category to its (optionally binned) score
    """
    # NOTE(review): ``bin`` is presumably this module's own bucketing helper
    # shadowing the builtin (the builtin rejects floats) — confirm.
    scores = word_category_counter.score_text(" ".join(words))
    return {
        category: (bin(score) if binning else score)
        for category, score in scores.items()
    }
def get_liwc_features(words):
    """
    Build LIWC score features plus pairwise "which is larger" flags.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict with the raw scores of seven LIWC categories and one
        ``liwc:*`` flag per comparison (positive vs. negative, anger vs.
        sadness, insight vs. discrepancy, tentative vs. discrepancy)
    """
    scores = word_category_counter.score_text(" ".join(words))

    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    anger = scores['Anger']
    insight = scores['Insight']
    sadness = scores['Sadness']
    discrepancy = scores['Discrepancy']
    tentative = scores['Tentative']

    # Key order mirrors the order in which the keys used to be first inserted.
    feature_vectors = {
        'Insight': insight,
        'Positive Emotion': positive,
        'Discrepancy': discrepancy,
        'Tentative': tentative,
        'Negative Emotion': negative,
        'Anger': anger,
        'Sadness': sadness,
    }

    # Each comparison switches on the flag of the winning side; the right
    # side wins ties.
    comparisons = (
        (positive, negative, "liwc:positive", "liwc:negative"),
        (anger, sadness, 'liwc:anger', 'liwc:sadness'),
        (insight, discrepancy, 'liwc:insight', 'liwc:discrepancy'),
        (tentative, discrepancy, 'liwc:tentative', 'liwc:discrepancy'),
    )
    for left, right, left_flag, right_flag in comparisons:
        feature_vectors[left_flag if left > right else right_flag] = 1
    return feature_vectors
def add_liwc_features(text, feature_vector):
    """
    Add a LIWC polarity flag for ``text`` to ``feature_vector`` in place.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping that receives ``"liwc:positive"``
        when the positive-emotion score is strictly larger than the
        negative-emotion score, and ``"liwc:negative"`` otherwise
    :return: None
    """
    scores = word_category_counter.score_text(text)
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    polarity_key = "liwc:positive" if positive > negative else "liwc:negative"
    feature_vector[polarity_key] = 1
def get_liwc_features(words):
    """
    Build a feature dict from a fixed selection of LIWC category scores.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict with the selected category scores and a ``liwc:positive``
        or ``liwc:negative`` flag (negative wins ties)
    """
    scores = word_category_counter.score_text(" ".join(words))
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]

    feature_vectors = {
        "Negative Emotion": negative,
        "Positive Emotion": positive,
        "LIWC_OPTIMISM": scores["Optimism and energy"],
        "LIWC_PEOPLE": scores["Other references to people"],
        "LIWC_HUMANS": scores["Humans"],
        "LIWC_COMMUNINCATION": scores["Communication"],
        "LIWC_NEGATIONS": scores["Negations"],
    }

    polarity_key = "liwc:positive" if positive > negative else "liwc:negative"
    feature_vectors[polarity_key] = 1
    return feature_vectors
def get_liwc_features(words):
    """
    Build binned LIWC features for all categories plus polarity features.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict with one ``LIWC_<category>`` entry per scored category
        (value passed through ``bin``), the raw negative and positive emotion
        scores, and a ``liwc:positive`` or ``liwc:negative`` flag
    """
    # NOTE(review): ``bin`` is presumably this module's own bucketing helper
    # shadowing the builtin — confirm.
    scores = word_category_counter.score_text(" ".join(words))
    feature_vectors = {
        "LIWC_{}".format(category): bin(freq)
        for category, freq in scores.items()
    }

    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    feature_vectors["Negative Emotion"] = negative
    feature_vectors["Positive Emotion"] = positive

    polarity_key = "liwc:positive" if positive > negative else "liwc:negative"
    feature_vectors[polarity_key] = 1
    return feature_vectors
def add_liwc_features(text, feature_vector):
    """
    Add LIWC polarity and binned category indicators to ``feature_vector``.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping updated in place; it receives a
        ``liwc:positive`` / ``liwc:negative`` flag and, for each selected
        category, a key ``<prefix>_<bin(round(score))>`` set to 1
    :return: None
    """
    scores = word_category_counter.score_text(text)

    # set 1 of liwc features
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    feature_vector["liwc:positive" if positive > negative else "liwc:negative"] = 1

    binned_categories = (
        ("liwc:anger", "Anger"),
        ("liwc:optimism", "Optimism and energy"),
        ("liwc:Swear_Words", "Swear Words"),
        ("liwc:sad", "Sadness"),
        # set 2 of liwc features
        ("liwc:Negations", "Negations"),
        ("liwc:Family", "Family"),
        ("liwc:Friends", "Friends"),
        ("liwc:Anxiety", "Anxiety"),
        ("liwc:Feel", "Feel"),
        ("liwc:Positive feelings", "Positive feelings"),
    )
    for prefix, category in binned_categories:
        # The binned value becomes part of the feature name itself.
        feature_vector[prefix + "_" + str(bin(round(scores[category])))] = 1
def add_liwc_features(review):
    """
    Attach a LIWC polarity feature to every row of ``review``.

    A text counts as positive when its "Positive Emotion" score is strictly
    larger than its "Negative Emotion" score, and as negative otherwise.

    Two defects of the previous version are fixed here:

    * the feature column was re-indexed to 0..n-1 while the frame kept its
      own index, so frames without a default index got misaligned features;
      the column is now built in row order on the frame's own index;
    * when the frame had no ``'frequency'`` column, the new column was only
      set on a temporary row copy and never reached the frame; it is now
      created on the frame itself.

    Args:
        review (DataFrame): frame with a ``'text'`` column and optionally a
            ``'frequency'`` column holding one dict per row.

    Returns:
        The same frame, modified in place: ``'liwc'`` holds
        ``{"liwc:positive": 1}`` or ``{"liwc:negative": 1}`` per row, and
        ``'frequency'`` contains those entries as well.
    """
    liwc_features = []
    for text in review['text']:
        # All possible keys to liwc_scores start on line 269
        # of the word_category_counter.py script
        liwc_scores = word_category_counter.score_text(text)
        negative_score = liwc_scores["Negative Emotion"]
        positive_score = liwc_scores["Positive Emotion"]
        if positive_score > negative_score:
            liwc_features.append({"liwc:positive": 1})
        else:
            liwc_features.append({"liwc:negative": 1})

    # Build the column on the frame's own index so rows line up by position.
    review['liwc'] = pd.Series(liwc_features, index=review.index, dtype=object)

    # Add the liwc feature to the existing features.
    if 'frequency' in review.columns:
        for frequency, liwc in zip(review['frequency'], liwc_features):
            frequency.update(liwc)
    else:
        # Copies keep later edits to 'frequency' from leaking into 'liwc'.
        review['frequency'] = pd.Series(
            [dict(liwc) for liwc in liwc_features],
            index=review.index,
            dtype=object,
        )
    return review
def get_liwc_features(words):
    """
    Classify a token sequence as positive or negative from weighted LIWC sums.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict holding either ``{"liwc:positive": 1}`` or
        ``{"liwc:negative": 1}`` (negative wins ties)
    """
    scores = word_category_counter.score_text(" ".join(words))

    # Some values were too high, so two of the terms carry a scalar.
    # The terms are added left to right, starting from 0.
    negative_total = (
        0
        + scores["Negative Emotion"] * 2
        + scores["Anger"]
        + scores["Anxiety"]
        + scores["Sadness"]
        + scores["Metaphysical issues"]
        + scores["Death"]
    )
    positive_total = (
        0
        + scores["Positive Emotion"] / 6
        + scores["Optimism and energy"]
        + scores["Achievement"]
        + scores["Future Tense"]
    )

    if positive_total > negative_total:
        return {"liwc:positive": 1}
    return {"liwc:negative": 1}
def add_liwc_features(text, feature_vector):
    """
    Flag ``text`` as positive or negative in ``feature_vector`` using LIWC.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping updated in place with
        ``"liwc:positive"`` (positive emotion strictly ahead) or
        ``"liwc:negative"`` (otherwise) set to 1
    :return: None
    """
    scores = word_category_counter.score_text(text)
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]
    if positive > negative:
        feature_vector["liwc:positive"] = 1
        return
    feature_vector["liwc:negative"] = 1
def get_liwc_scores(wc, rows, col_name):
    """
    Score one text column of ``rows`` with LIWC and return aligned rows.

    Each text is scored exactly once (the scorer used to be called twice per
    row), and ``rows`` is consumed in a single pass, so one-shot iterators
    are handled as well. The debugging output, which printed the whole
    accumulated result, was removed.

    :param wc: object exposing ``score_text(text)`` that returns a mapping
        from category name to score
    :param rows: iterable of records indexable by ``col_name``
    :param col_name: key of the text field inside each record
    :return: tuple ``(all_scores, category_list)``; ``category_list`` is the
        sorted list of every category seen, and each entry of ``all_scores``
        is ``[text, score_1, ..., score_k]`` following that order (``0.0``
        when the text has no score for a category)
    """
    texts = [row[col_name] for row in rows]
    scored_texts = [wc.score_text(text) for text in texts]

    # The column layout is the union of the categories seen across all texts.
    categories = set()
    for liwc_scores in scored_texts:
        categories.update(liwc_scores.keys())
    category_list = sorted(categories)

    all_scores = [
        [text] + [liwc_scores.get(category, 0.0) for category in category_list]
        for text, liwc_scores in zip(texts, scored_texts)
    ]
    return all_scores, category_list
def add_liwc_features(text, feature_vector):
    """
    Record the LIWC polarity of ``text`` in ``feature_vector``.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping updated in place; gets
        ``"liwc:positive"`` = 1 when the positive-emotion score exceeds the
        negative-emotion score, else ``"liwc:negative"`` = 1
    :return: None
    """
    emotion_scores = word_category_counter.score_text(text)
    neg = emotion_scores["Negative Emotion"]
    pos = emotion_scores["Positive Emotion"]
    label = "liwc:positive" if pos > neg else "liwc:negative"
    feature_vector[label] = 1
def add_liwc_features(text, feature_vector):
    """
    Mark ``text`` as LIWC-positive or LIWC-negative in ``feature_vector``.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping updated in place with exactly one
        of ``"liwc:positive"`` / ``"liwc:negative"`` set to 1; ties count as
        negative
    :return: None
    """
    liwc = word_category_counter.score_text(text)
    negative_emotion = liwc["Negative Emotion"]
    positive_emotion = liwc["Positive Emotion"]
    if positive_emotion > negative_emotion:
        feature_vector["liwc:positive"] = 1
        return
    feature_vector["liwc:negative"] = 1
def apply_naive_bayes(row):
    """
    Classify one row with the module-level naive Bayes model.

    The model lives in module-level names: ``bow`` (per-class lookup tables
    indexed by 1 and 0), the unigram/bigram word counts and vocabulary sizes,
    and ``priorPortion``.

    :param row: mapping-like record; the keys ``'liwc'`` (with ``'text'``),
        ``'uni_tokens'`` and ``'bi_tokens'`` each switch on one evidence
        source when present
    :return: 1 when ``priorPortion`` plus the class-1 total minus the
        class-2 total is positive, otherwise 0
    """
    # Running totals per class label: 1 is "class 1", 0 is "class 2".
    totals = {1: 0, 0: 0}

    def _add_token_scores(tokens, n_words, unique_words):
        # Known tokens add the log of their stored value; unknown tokens
        # fall back to 1 / (class word count + vocabulary size).
        for token in tokens:
            for label in (1, 0):
                if token in bow[label]:
                    totals[label] += math.log(bow[label][token], 10)
                else:
                    totals[label] += math.log(
                        1.0 / (n_words[label] + unique_words), 10)

    if 'liwc' in row:
        liwc_scores = word_category_counter.score_text(row['text'])
        negative = liwc_scores["Negative Emotion"]
        positive = liwc_scores["Positive Emotion"]
        # The stored LIWC value is added as-is (no logarithm here).
        if positive > negative:
            totals[1] += bow[1]['liwc:positive']
        else:
            totals[0] += bow[0]["liwc:negative"]

    if 'uni_tokens' in row:
        _add_token_scores(row['uni_tokens'], uni_n_words_, uni_unique_words)

    if 'bi_tokens' in row:
        _add_token_scores(row['bi_tokens'], bi_n_words_, bi_unique_words)

    # classification
    return 1 if priorPortion + totals[1] - totals[0] > 0 else 0
def get_liwc_features(words, binning=None):
    """
    Build one binned feature per LIWC category of a token sequence.

    :param words: iterable of string tokens, joined with spaces for scoring
    :param binning: accepted but not used by this function
    :return: dict mapping ``liwc:<category>`` to ``bin(score)`` for every
        category in the LIWC scores
    """
    # NOTE(review): ``bin`` is presumably this module's own bucketing helper
    # shadowing the builtin — confirm.
    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    scores = word_category_counter.score_text(" ".join(words))
    return {
        "liwc:{0}".format(category): bin(score)
        for category, score in scores.items()
    }
def get_liwc_features(text):
    """
    Build emotion-score and polarity features for ``text``.

    :param text: text to score with ``word_category_counter.score_text``
    :return: dict with the raw "Negative Emotion" and "Positive Emotion"
        scores and a ``liwc:positive`` or ``liwc:negative`` flag (negative
        wins ties)
    """
    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    scores = word_category_counter.score_text(text)
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]

    feature_vectors = {
        "Negative Emotion": negative,
        "Positive Emotion": positive,
    }
    polarity_key = "liwc:positive" if positive > negative else "liwc:negative"
    feature_vectors[polarity_key] = 1
    return feature_vectors
def get_liwc_features(words):
    """
    Build one integer feature per LIWC category from raw counts.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict mapping ``LIWC:<long-name>`` (spaces replaced by dashes)
        to the integer count of that category, for every entry of
        ``word_category_counter.Dictionary._liwc_categories``
    """
    raw_counts = word_category_counter.score_text(" ".join(words), raw_counts=True)

    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    category_table = word_category_counter.Dictionary._liwc_categories

    # Each table entry is unpacked as a 5-tuple; only the long name is used.
    named_counts = (
        (long_name, int(raw_counts[long_name]))
        for long_name, _, _, _, _ in category_table
    )
    return {
        "LIWC:{}".format(long_name.replace(" ", "-")): count
        for long_name, count in named_counts
    }
def get_liwc_features(words):
    """
    Build emotion and cognition features from the LIWC scores of ``words``.

    :param words: iterable of string tokens; falsy entries are skipped and
        the rest is joined with spaces for scoring
    :return: dict with seven LIWC scores under short names and a
        ``liwc:positive`` or ``liwc:negative`` flag (negative wins ties)
    """
    text = " ".join([word for word in words if word])

    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    scores = word_category_counter.score_text(text)
    negative = scores["Negative Emotion"]
    positive = scores["Positive Emotion"]

    feature_vectors = {
        "Negative Emotion": negative,
        "Positive Emotion": positive,
        "Anxiety": scores["Anxiety"],
        "Sad": scores["Sadness"],
        "Angry": scores["Anger"],
        "Thought": scores["Cognitive Processes"],
        "Feel": scores["Perceptual Processes"],
    }

    polarity_key = "liwc:positive" if positive > negative else "liwc:negative"
    feature_vectors[polarity_key] = 1
    return feature_vectors
def get_liwc_features(words):
    """
    Build one binned feature per LIWC category of a token sequence.

    :param words: iterable of string tokens, joined with spaces for scoring
    :return: dict mapping ``LIWC:<category>`` to ``liwc_bin(score)`` for
        every category in the LIWC scores
    """
    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    scores = word_category_counter.score_text(" ".join(words))
    return {
        "LIWC:" + category: liwc_bin(scores[category])
        for category in scores
    }
def add_liwc_features(text, feature_vector):
    """
    Extend ``feature_vector`` with LIWC polarity and binned-score indicators.

    :param text: text to score with ``word_category_counter.score_text``
    :param feature_vector: mutable mapping updated in place; it gets one
        polarity flag (``liwc:positive`` or ``liwc:negative``) and one
        ``<name>_<bin(round(score))>`` key per selected category, all set
        to 1
    :return: None
    """
    liwc = word_category_counter.score_text(text)

    # set 1 of liwc features
    negative_emotion = liwc["Negative Emotion"]
    positive_emotion = liwc["Positive Emotion"]
    if positive_emotion > negative_emotion:
        feature_vector["liwc:positive"] = 1
    else:
        feature_vector["liwc:negative"] = 1

    first_set = [
        ("liwc:anger", "Anger"),
        ("liwc:optimism", "Optimism and energy"),
        ("liwc:Swear_Words", "Swear Words"),
        ("liwc:sad", "Sadness"),
    ]
    # set 2 of liwc features
    second_set = [
        ("liwc:Negations", "Negations"),
        ("liwc:Family", "Family"),
        ("liwc:Friends", "Friends"),
        ("liwc:Anxiety", "Anxiety"),
        ("liwc:Feel", "Feel"),
        ("liwc:Positive feelings", "Positive feelings"),
    ]
    for feature_name, category in first_set + second_set:
        # The rounded, binned score is baked into the feature name.
        bucket = str(bin(round(liwc[category])))
        feature_vector[feature_name + "_" + bucket] = 1
# Read (filename, sentence) rows from ``sent_file``, score every sentence
# with LIWC and write the emotion scores plus a polarity value to
# "<sent_file without its last four characters>_LIWC_Emotions.csv".
sent_input = []
# NOTE(review): the 'U' open flag was removed in Python 3.11; it is kept
# because the interpreter this script targets cannot be determined from here.
with open(sent_file, 'rU') as csvfile:
    label_reader = csv.reader(csvfile)
    for row in label_reader:
        sent_input += [row]

wc.load_dictionary(wc.default_dictionary_filename())

# NOTE(review): the header names seven columns while every data row below
# carries eight values (the trailing polarity has no header) — confirm
# before changing, since consumers may depend on the current header.
csv_op = [[
    "Filename", "Sentence", "Positive Emotion", "Negative Emotion",
    "Sadness", "Anger", "Anxiety"
]]
for pair in sent_input:
    name = pair[0]
    sentence = pair[1]
    liwc = wc.score_text(sentence)
    # Polarity: 1 when positive emotion dominates, -1 when negative emotion
    # dominates, 0 on a tie.
    if liwc["Positive Emotion"] > liwc["Negative Emotion"]:
        x = 1
    elif liwc["Positive Emotion"] < liwc["Negative Emotion"]:
        x = -1
    else:
        x = 0
    csv_op += [[
        name, sentence, liwc["Positive Emotion"], liwc["Negative Emotion"],
        liwc["Sadness"], liwc["Anger"], liwc["Anxiety"], x
    ]]

# The output handle used to be left open, so buffered rows were not
# guaranteed to reach the disk; the context manager flushes and closes it.
# The names ``a`` and ``b`` are kept in case later code refers to them.
with open(sent_file[:-4] + "_LIWC_Emotions.csv", "w") as b:
    a = csv.writer(b)
    a.writerows(csv_op)
def process_lyrics( artist_name ):
    """
    Clean one artist's scraped lyrics file and save its normalized LIWC scores.

    Reads ``<artist_name>EXT.txt``, removes the page-title residue and the
    azlyrics credit blurbs, scores the cleaned text with
    ``word_category_counter`` and writes the normalized scores to
    ``SCORES/<artist_name>SCORES.txt`` (name and value on alternating lines)
    and to ``SCORES/<artist_name>SCORES.pickle``.

    :param artist_name: prefix of the input file name and of both output files
    :return: None
    """
    filename = artist_name + "EXT.txt"
    with open(filename, 'r') as f:
        raw_songs = f.read()
    print "Processing: ", filename

    # Taking the title and credits out of the file
    # Title: ``head`` is the text up to and including the first "LYRICS".
    # NOTE(review): ``search`` returns None when a pattern does not match,
    # so a file without that header fails on ``.expand`` — confirm inputs
    # always carry it.
    pat1 = re.compile(r'(.+?LYRICS)')
    head = pat1.search(raw_songs).expand(r'\1')
    # The first double-quoted span inside the header.
    pat2 = re.compile(r'(".+?")')
    match2 = pat2.search(head).expand(r'\1')
    # Everything in the header after its first len(match2) characters is
    # dropped from the text.
    # NOTE(review): slicing by length assumes the quoted span sits at the
    # very start of ``head`` — confirm.
    extraneous_crap = head[len(match2):]
    clean_raw_songs = raw_songs.replace(extraneous_crap, '')

    # Credits: every "Visit www.azlyrics ... Search" blurb is replaced by a
    # blank.
    pattern = re.compile(r'(Visit www\.azlyrics.*?Search)')
    credits = re.findall(pattern, raw_songs)
    for credit in credits:
        clean_raw_songs = clean_raw_songs.replace(credit, ' ')
    raw_songs = clean_raw_songs

    # New line characters were mistakenly removed in the scraping process,
    # but they were replaced with ' ' and there were two of them, so here
    # we can use ' ' to split the lines, instead of .splitlines()
    # NOTE(review): the comment above describes a doubled separator while
    # the literal below is a single blank — confirm the intended delimiter.
    raw_lines = raw_songs.split(' ')
    song_sents = [nltk.word_tokenize(line) for line in raw_lines]
    song_words = [word.lower() for sent in song_sents for word in sent]

    # The stop-word filter below is disabled, so ``words`` ends up bound to
    # ``song_words``; neither is used by the code that is still active.
    words = []
    stops = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
             'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
             'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
             'itself', 'they', 'them', 'their', 'theirs', 'themselves',
             'what', 'which', 'who', 'whom', 'this', 'that', 'these',
             'those', 'am', 'is', 'are', 'were', 'be', 'been', 'being',
             'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
             'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
             'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
             'between', 'into', 'through', 'during', 'before', 'after',
             'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
             'over', 'under', 'further', 'then', 'once', 'here', 'there',
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
             'few', 'other', 'some', 'such', 'only', 'own', 'same', 'so',
             'than', 'too', 'very', 's', 'can', 'will', 'just', 'now',
             'went', 'asked', 'was']
    words = song_words
    #for word in song_words:
    #    if len(re.findall(r'\w', word)) > 0:
    #        if word not in stops:
    #            words.append(word)

    # Disabled exploratory analysis: frequency distribution and bigram
    # collocations.
    #fdist = nltk.FreqDist(words)
    #print fdist.B()
    #print fdist.N()
    #print fdist.items()[:20], "\n\n"
    #bigram_measures = nltk.collocations.BigramAssocMeasures()
    #finder = BigramCollocationFinder.from_words(words)
    #finder.apply_freq_filter(3)
    #print finder.nbest(bigram_measures.pmi, 10), "\n\n"
    #print finder.ngram_fd.viewitems(), "\n\n"
    #bigrams = nltk.bigrams(words)
    #bfdist = nltk.FreqDist(bigrams)
    #print bfdist.items()[:20], "\n\n"

    # The whole cleaned text is scored as one document.
    liwc_scores = word_category_counter.score_text(raw_songs)
    normalized_liwc_scores = word_category_counter.normalize_scores(liwc_scores)

    # Plain-text dump: category name and value on consecutive lines.
    outfile = "SCORES/" + artist_name + "SCORES.txt"
    with open(outfile, 'w') as outf:
        for name, value in normalized_liwc_scores.items():
            outf.write("{0}\n{1}\n".format(name, value))

    # NOTE(review): the pickle is written through a text-mode handle ('w');
    # binary pickle protocols need 'wb' — confirm which protocol is in use.
    outfile = "SCORES/" + artist_name + "SCORES.pickle"
    with open(outfile, 'w') as outf:
        pickle.dump(normalized_liwc_scores, outf)