def get_liwc_scores(wc, rows):
    """Score every text in ``rows`` and align the scores on one category list.

    :param wc: object exposing ``score_text(text)``, which must return a
        mapping of category name to score.
    :param rows: iterable of texts. It is consumed exactly once, so one-shot
        iterators and generators are supported.
    :return: ``(all_scores, category_list)``. ``category_list`` is the sorted
        union of the categories returned for any text; ``all_scores`` holds
        one list per text with its scores in ``category_list`` order, using
        ``0.0`` for categories that text did not produce.
    """
    # Score each text once and keep the result: the category union has to be
    # known before any row can be laid out, but that does not require
    # scoring (or iterating ``rows``) a second time.
    scored_texts = [wc.score_text(sent) for sent in rows]

    categories = set()
    for liwc_scores in scored_texts:
        categories.update(liwc_scores.keys())
    category_list = sorted(categories)

    all_scores = [
        [liwc_scores.get(category, 0.0) for category in category_list]
        for liwc_scores in scored_texts
    ]
    return all_scores, category_list
Пример #2
0
def get_liwc_features(train_data, test_data):
    """
        Build LIWC feature matrices for the training and the test phrases.

        Every phrase is scored with ``word_category_counter.score_text`` and
        turned into one row that follows the order of ``liwc_categories``;
        a category missing from the scores contributes 0.

        NOTE: this function is currently not being used in this program.

        :param train_data: iterable of training phrases
        :param test_data: iterable of test phrases
        :return: tuple ``(train_matrix, test_matrix)`` of
            ``sparse.csr_matrix`` objects
    """
    print("getting liwc features")

    def phrase_to_row(phrase):
        # One value per configured category, 0 when the scorer did not
        # report that category for this phrase.
        scores = word_category_counter.score_text(phrase)
        return [
            scores[name] if name in scores.keys() else 0
            for name in liwc_categories
        ]

    train_rows = [phrase_to_row(phrase) for phrase in train_data]
    test_rows = [phrase_to_row(phrase) for phrase in test_data]
    return sparse.csr_matrix(train_rows), sparse.csr_matrix(test_rows)
Пример #3
0
def populate_features_labels(annot_type,
                             embed_dim,
                             use_topic_only,
                             use_accomodation_features=False):
    """Build the feature matrix, treatment vector and outcome vectors.

    One row is produced per triple in ``triples[annot_type]`` (as returned by
    ``load_dicts``). Each triple is read positionally as
    ``(p1, p2, p3, annot_val, did)``.

    :param annot_type: key selecting which list of triples to process
    :param embed_dim: width of the per-topic slot in the feature matrix
    :param use_topic_only: when true, each row is a one-hot topic indicator
        instead of embeddings plus LIWC sentiment values
    :param use_accomodation_features: when true, the 'accomodation' category
        group is appended to the sentiment categories
    :return: ``(features, treatments, outcome_map)`` where ``treatments`` is 1
        for triples whose annotation value is greater than 1 and
        ``outcome_map`` maps every category-group name to a vector of
        ``compute_outcome`` results
    """
    wc.load_dictionary(wc.default_dictionary_filename())

    discussion_posts, triples = load_dicts()
    post_embeddings = load_document_proportions(embed_dim)
    category_types = load_liwc_cat_groups()

    sent_cats = category_types['possent'] + category_types['negsent']
    if use_accomodation_features:
        sent_cats += category_types['accomodation']

    selected_triples = triples[annot_type]
    num_topics = len(topic_indices)
    num_triples = len(selected_triples)

    dim = embed_dim * num_topics + len(sent_cats)
    if use_topic_only:
        dim = num_topics

    features = np.zeros((num_triples, dim))
    treatments = np.zeros(num_triples)
    outcome_map = {}
    for group_name in category_types:
        outcome_map[group_name] = np.zeros(num_triples)

    for row, triple in enumerate(selected_triples):
        p1, p2, p3 = triple[0], triple[1], triple[2]
        annot_val = triple[3]
        did = triple[4]

        thread = discussion_posts[did]
        topic = thread[p1]['topic']

        # NOTE(review): the two post embeddings are concatenated but written
        # into a slot of width embed_dim below -- confirm each embedding is
        # embed_dim / 2 wide.
        embed = np.hstack([post_embeddings[p1], post_embeddings[p2]])

        p1_liwc = wc.score_text(thread[p1]['text'])
        p3_liwc = wc.score_text(thread[p3]['text'])
        p1_sent_vec = get_liwc_vector(p1_liwc, sent_cats)

        tidx = topic_indices[topic]
        if not use_topic_only:
            slot_start = tidx * embed_dim
            features[row, slot_start:slot_start + embed_dim] = embed
            features[row, dim - len(sent_cats):] = p1_sent_vec
        else:
            features[row, tidx] = 1

        treatments[row] = 1 if annot_val > 1 else 0

        for group_name in category_types:
            outcome_map[group_name][row] = compute_outcome(
                p1_liwc, p3_liwc, category_types[group_name])

    return features, treatments, outcome_map
Пример #4
0
def get_liwc_features(text, selected_features=None, get_all=False):
    """Return LIWC raw-count features for ``text``.

    The text is tokenized with ``get_tokens_tags`` (no normalization),
    re-joined with single spaces and scored with raw counts.

    :param text: input text
    :param selected_features: accepted but not used by this function
    :param get_all: when equal to ``True``, the complete score mapping from
        ``word_category_counter.score_text`` is returned unchanged
    :return: the full score mapping, or a dict with the emotion counts, a
        positive/negative indicator pair (only set when the two emotion
        counts differ) and a fixed selection of other categories
    """
    tokens, _ = get_tokens_tags(text, should_normalize=False)
    text = " ".join(' '.join(sent) for sent in tokens)
    liwc_scores = word_category_counter.score_text(text, raw_counts=True)

    if get_all == True:
        return liwc_scores

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]
    feature_vectors = {
        "liwc:neg_emotion": negative_score,
        "liwc:pos_emotion": positive_score,
    }

    # A tie leaves both indicator keys out of the result.
    if positive_score > negative_score:
        feature_vectors.update({"liwc:positive": 1, "liwc:negative": 0})
    elif positive_score < negative_score:
        feature_vectors.update({"liwc:positive": 0, "liwc:negative": 1})

    copied_categories = (
        ("liwc:swear_words", "Swear Words"),
        ("liwc:anger", "Anger"),
        ("liwc:health", "Health"),
        ("liwc:money", "Money"),
        ("liwc:pos_feelings", "Positive feelings"),
        ("liwc:time", "Time"),
    )
    for feature_name, category in copied_categories:
        feature_vectors[feature_name] = liwc_scores[category]

    return feature_vectors
Пример #5
0
def get_liwc_features(words, binning):
    """
    Return one feature per LIWC category scored for the given words.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :param binning: when truthy every score is passed through ``bin``,
        otherwise the score is used as-is
    :return: dict mapping each LIWC category name to its (optionally
        binned) score
    """
    # NOTE(review): ``bin`` resolves to the Python built-in unless this
    # module defines its own binning helper -- confirm which is intended.
    liwc_scores = word_category_counter.score_text(" ".join(words))
    return {
        category: (bin(score) if binning else score)
        for category, score in liwc_scores.items()
    }
Пример #6
0
def get_liwc_features(words):
    """
    Build LIWC score features plus pairwise "which is larger" indicators.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict with seven raw LIWC scores and the indicator keys
        ``liwc:positive``/``liwc:negative``, ``liwc:anger``/``liwc:sadness``,
        ``liwc:insight``/``liwc:discrepancy`` and ``liwc:tentative``
        (each indicator is 1 when present)
    """
    # Placeholder values; every one of these keys is overwritten with the
    # real score further down. Only the key order they establish survives.
    feature_vectors = {
        'Insight': 5,
        'Positive Emotion': 11,
        'Discrepancy': 4,
        'Tentative': 6,
        'Negative Emotion': 5,
    }

    liwc_scores = word_category_counter.score_text(" ".join(words))

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]
    anger_score = liwc_scores['Anger']
    insight_score = liwc_scores['Insight']
    sadness_score = liwc_scores['Sadness']
    discrepancy_score = liwc_scores['Discrepancy']
    tentative_score = liwc_scores['Tentative']

    for category, score in (
            ("Negative Emotion", negative_score),
            ("Positive Emotion", positive_score),
            ('Anger', anger_score),
            ('Insight', insight_score),
            ('Discrepancy', discrepancy_score),
            ('Sadness', sadness_score),
            ('Tentative', tentative_score),
    ):
        feature_vectors[category] = score

    # Each comparison sets exactly one indicator; ties go to the second name.
    feature_vectors[
        "liwc:positive" if positive_score > negative_score
        else "liwc:negative"] = 1
    feature_vectors[
        'liwc:anger' if anger_score > sadness_score
        else 'liwc:sadness'] = 1
    feature_vectors[
        'liwc:insight' if insight_score > discrepancy_score
        else 'liwc:discrepancy'] = 1
    feature_vectors[
        'liwc:tentative' if tentative_score > discrepancy_score
        else 'liwc:discrepancy'] = 1
    return feature_vectors
def add_liwc_features(text, feature_vector):
    """Add a LIWC polarity flag for ``text`` to ``feature_vector`` in place.

    Sets ``feature_vector["liwc:positive"] = 1`` when the "Positive Emotion"
    score is strictly greater than the "Negative Emotion" score, otherwise
    sets ``feature_vector["liwc:negative"] = 1``. Returns nothing.
    """
    liwc_scores = word_category_counter.score_text(text)

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]

    polarity_key = ("liwc:positive" if positive_score > negative_score
                    else "liwc:negative")
    feature_vector[polarity_key] = 1
Пример #8
0
def get_liwc_features(words):
    """
    Build a small set of LIWC features for the given words.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict holding the two emotion scores, five further category
        scores under ``LIWC_*`` names, and a ``liwc:positive`` or
        ``liwc:negative`` indicator set to 1
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))

    # (feature name, LIWC category) in the order the features are emitted.
    selected = (
        ("Negative Emotion", "Negative Emotion"),
        ("Positive Emotion", "Positive Emotion"),
        ("LIWC_OPTIMISM", "Optimism and energy"),
        ("LIWC_PEOPLE", "Other references to people"),
        ("LIWC_HUMANS", "Humans"),
        ("LIWC_COMMUNINCATION", "Communication"),
        ("LIWC_NEGATIONS", "Negations"),
    )
    feature_vectors = {}
    for feature_name, category in selected:
        feature_vectors[feature_name] = liwc_scores[category]

    is_positive = (feature_vectors["Positive Emotion"] >
                   feature_vectors["Negative Emotion"])
    if is_positive:
        feature_vectors["liwc:positive"] = 1
    else:
        feature_vectors["liwc:negative"] = 1

    return feature_vectors
Пример #9
0
def get_liwc_features(words):
    """
    Build binned LIWC features plus the raw emotion scores.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict with ``LIWC_<category>`` set to ``bin(score)`` for every
        scored category, the raw "Negative Emotion" and "Positive Emotion"
        scores, and a ``liwc:positive`` or ``liwc:negative`` indicator
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))

    # NOTE(review): ``bin`` resolves to the Python built-in unless this
    # module defines its own binning helper -- confirm which is intended.
    feature_vectors = {
        "LIWC_{}".format(category): bin(score)
        for category, score in liwc_scores.items()
    }

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]
    feature_vectors["Negative Emotion"] = negative_score
    feature_vectors["Positive Emotion"] = positive_score

    polarity_key = ("liwc:positive" if positive_score > negative_score
                    else "liwc:negative")
    feature_vectors[polarity_key] = 1

    return feature_vectors
def add_liwc_features(text, feature_vector):
    """Add LIWC polarity and binned-category flags to ``feature_vector``.

    The dict is modified in place and nothing is returned. Besides the
    ``liwc:positive``/``liwc:negative`` flag, one key of the form
    ``<prefix>_<bin(round(score))>`` is set to 1 for each listed category.
    """
    liwc_scores = word_category_counter.score_text(text)

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]
    polarity_key = ("liwc:positive" if positive_score > negative_score
                    else "liwc:negative")
    feature_vector[polarity_key] = 1

    # (key prefix, LIWC category); the first four are "set 1", the rest
    # "set 2" of the LIWC features.
    binned_categories = (
        ("liwc:anger", "Anger"),
        ("liwc:optimism", "Optimism and energy"),
        ("liwc:Swear_Words", "Swear Words"),
        ("liwc:sad", "Sadness"),
        ("liwc:Negations", "Negations"),
        ("liwc:Family", "Family"),
        ("liwc:Friends", "Friends"),
        ("liwc:Anxiety", "Anxiety"),
        ("liwc:Feel", "Feel"),
        ("liwc:Positive feelings", "Positive feelings"),
    )
    for prefix, category in binned_categories:
        bucket = str(bin(round(liwc_scores[category])))
        feature_vector[prefix + "_" + bucket] = 1
Пример #11
0
def add_liwc_features(review):
  """
  Attach a LIWC polarity feature to every row of ``review``.

  Args:
      (dataframe)review: needs a 'text' column; may already have a
          'frequency' column holding one feature dict per row.

  Returns:
      The same dataframe, modified in place. A 'liwc' column holds
      {"liwc:positive": 1} when the Positive Emotion score is greater than
      the Negative Emotion score and {"liwc:negative": 1} otherwise. That
      flag is also merged into each row's 'frequency' dict; when there is no
      'frequency' column it is created from copies of the 'liwc' dicts.
  """
  liwc_features = []
  for text in review['text']:
      # All possible keys to liwc_scores start on line 269
      # of the word_category_counter.py script
      liwc_scores = word_category_counter.score_text(text)
      negative_score = liwc_scores["Negative Emotion"]
      positive_score = liwc_scores["Positive Emotion"]

      if positive_score > negative_score:
          liwc_features.append({"liwc:positive": 1})
      else:
          liwc_features.append({"liwc:negative": 1})

  # Build the column on review's own index so each value lands on the row
  # it was computed from, whatever the index labels are.
  review['liwc'] = pd.Series(liwc_features, index=review.index, dtype=object)

  # Add the liwc feature to the existing features. This is done on the
  # columns directly because assigning to the row object handed to
  # DataFrame.apply does not write back to the frame.
  if 'frequency' in review.columns:
      for frequency, liwc in zip(review['frequency'], review['liwc']):
          frequency.update(liwc)
  else:
      review['frequency'] = review['liwc'].apply(dict)

  return review
def get_liwc_features(words):
    """
    Classify the words as positive or negative from weighted LIWC scores.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict with exactly one key, ``liwc:positive`` or
        ``liwc:negative``, set to 1
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))

    # Some values were too high, so the two emotion scores are rescaled
    # (negative doubled, positive divided by 6) before being combined.
    negative_score = (liwc_scores["Negative Emotion"] * 2
                      + liwc_scores["Anger"]
                      + liwc_scores["Anxiety"]
                      + liwc_scores["Sadness"]
                      + liwc_scores["Metaphysical issues"]
                      + liwc_scores["Death"])

    positive_score = (liwc_scores["Positive Emotion"] / 6
                      + liwc_scores["Optimism and energy"]
                      + liwc_scores["Achievement"]
                      + liwc_scores["Future Tense"])

    if positive_score > negative_score:
        return {"liwc:positive": 1}
    return {"liwc:negative": 1}
Пример #13
0
def add_liwc_features(text, feature_vector):
    """Flag ``text`` as LIWC-positive or LIWC-negative in ``feature_vector``.

    The dict is updated in place: ``liwc:positive`` is set to 1 when the
    "Positive Emotion" score exceeds the "Negative Emotion" score, and
    ``liwc:negative`` is set to 1 in every other case (ties included).
    """
    liwc_scores = word_category_counter.score_text(text)
    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]

    if positive_score > negative_score:
        feature_vector["liwc:positive"] = 1
        return
    feature_vector["liwc:negative"] = 1
Пример #14
0
def get_liwc_scores(wc, rows, col_name):
    """Score one column of every row and align the scores on one category list.

    :param wc: object exposing ``score_text(text)``, which must return a
        mapping of category name to score.
    :param rows: iterable of records indexable by ``col_name``. It is consumed
        exactly once, so one-shot iterators (e.g. a CSV reader) are supported.
    :param col_name: key of the text field inside each row.
    :return: ``(all_scores, category_list)``. ``category_list`` is the sorted
        union of the categories returned for any text; ``all_scores`` holds
        one list per row: the text itself followed by its scores in
        ``category_list`` order, using ``0.0`` for categories that text did
        not produce.
    """
    # Score each text once and keep it next to its scores: the category union
    # has to be known before any row can be laid out, but that does not
    # require scoring (or iterating ``rows``) a second time.
    scored_rows = []
    for row in rows:
        text = row[col_name]
        scored_rows.append((text, wc.score_text(text)))

    categories = set()
    for _, liwc_scores in scored_rows:
        categories.update(liwc_scores.keys())
    category_list = sorted(categories)

    all_scores = [
        [text] + [liwc_scores.get(category, 0.0) for category in category_list]
        for text, liwc_scores in scored_rows
    ]
    return all_scores, category_list
def add_liwc_features(text, feature_vector):
    """Record the LIWC polarity of ``text`` in ``feature_vector`` (in place).

    ``liwc:positive`` is set to 1 when "Positive Emotion" scores strictly
    higher than "Negative Emotion"; otherwise ``liwc:negative`` is set to 1.
    """
    scores = word_category_counter.score_text(text)
    negative_score, positive_score = (
        scores[category]
        for category in ("Negative Emotion", "Positive Emotion")
    )

    if positive_score > negative_score:
        label = "liwc:positive"
    else:
        label = "liwc:negative"
    feature_vector[label] = 1
Пример #16
0
def add_liwc_features(text, feature_vector):
    """Set a LIWC polarity flag for ``text`` on ``feature_vector``.

    Mutates ``feature_vector`` and returns nothing. The negative flag is the
    fallback: it is used whenever the "Positive Emotion" score is not
    strictly greater than the "Negative Emotion" score.
    """
    liwc_scores = word_category_counter.score_text(text)

    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]

    if not positive_score > negative_score:
        feature_vector["liwc:negative"] = 1
    else:
        feature_vector["liwc:positive"] = 1
Пример #17
0
    def apply_naive_bayes(row):
        """Classify one row as 1 or 0 from accumulated per-class scores.

        Each of the fields 'liwc', 'uni_tokens' and 'bi_tokens' contributes
        only when it is present in ``row``. For every token, the base-10 log
        of its ``bow[c]`` entry is added to class ``c``'s score; a token
        missing from ``bow[c]`` contributes the log of
        ``1.0 / (n_words[c] + unique_words)`` instead.

        :param row: mapping-like record (supports ``in`` and item access)
        :return: 1 when ``priorPortion + score1 - score0`` is positive,
            otherwise 0
        """
        class1_score = 0
        class0_score = 0

        if 'liwc' in row:
            liwc_scores = word_category_counter.score_text(row['text'])
            negative_score = liwc_scores["Negative Emotion"]
            positive_score = liwc_scores["Positive Emotion"]
            # NOTE(review): these table values are added as-is, while token
            # terms below are added as log10 -- confirm this is intended.
            if positive_score > negative_score:
                class1_score += bow[1]['liwc:positive']
            else:
                class0_score += bow[0]["liwc:negative"]

        if 'uni_tokens' in row:
            for token in row['uni_tokens']:
                class1_value = (
                    bow[1][token] if token in bow[1]
                    else 1.0 / (uni_n_words_[1] + uni_unique_words))
                class1_score += math.log(class1_value, 10)

                class0_value = (
                    bow[0][token] if token in bow[0]
                    else 1.0 / (uni_n_words_[0] + uni_unique_words))
                class0_score += math.log(class0_value, 10)

        if 'bi_tokens' in row:
            for token in row['bi_tokens']:
                class1_value = (
                    bow[1][token] if token in bow[1]
                    else 1.0 / (bi_n_words_[1] + bi_unique_words))
                class1_score += math.log(class1_value, 10)

                class0_value = (
                    bow[0][token] if token in bow[0]
                    else 1.0 / (bi_n_words_[0] + bi_unique_words))
                class0_score += math.log(class0_value, 10)

        # classification
        return 1 if priorPortion + class1_score - class0_score > 0 else 0
Пример #18
0
def get_liwc_features(words, binning=None):
    """
    Return a binned feature for every LIWC category scored for the words.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :param binning: accepted but not used by this function
    :return: dict mapping ``liwc:<category>`` to ``bin(score)``
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))

    # NOTE(review): ``bin`` resolves to the Python built-in unless this
    # module defines its own binning helper -- confirm which is intended.
    return {
        "liwc:{0}".format(category): bin(liwc_scores[category])
        for category in liwc_scores.keys()
    }
Пример #19
0
def get_liwc_features(text):
    """Return the LIWC emotion scores of ``text`` plus a polarity flag.

    :param text: text to score
    :return: dict with the raw "Negative Emotion" and "Positive Emotion"
        scores and either ``liwc:positive`` (positive strictly greater) or
        ``liwc:negative`` set to 1
    """
    liwc_scores = word_category_counter.score_text(text)
    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]

    feature_vectors = {
        "Negative Emotion": negative_score,
        "Positive Emotion": positive_score,
    }
    polarity_key = ("liwc:positive" if positive_score > negative_score
                    else "liwc:negative")
    feature_vectors[polarity_key] = 1
    return feature_vectors
Пример #20
0
def get_liwc_features(words):
    """
    Return the raw LIWC count of every known category for the words.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict mapping ``LIWC:<category name with spaces replaced by
        hyphens>`` to the integer raw count of that category
    """
    liwc_scores = word_category_counter.score_text(" ".join(words),
                                                   raw_counts=True)

    # Category entries are 5-tuples whose first item is the long name used
    # as the key into the scores.
    feature_vectors = {}
    category_table = word_category_counter.Dictionary._liwc_categories
    for long_name, _, _, _, _ in category_table:
        raw_count = int(liwc_scores[long_name])
        feature_name = "LIWC:{}".format(long_name.replace(" ", "-"))
        feature_vectors[feature_name] = raw_count
    return feature_vectors
Пример #21
0
def get_liwc_features(words):
    """
    Build emotion and process LIWC features for the given words.

    :param words: iterable of word strings; falsy entries (such as empty
        strings) are dropped before the rest are joined with single spaces
    :return: dict with seven LIWC scores under short feature names and a
        ``liwc:positive`` or ``liwc:negative`` indicator set to 1
    """
    text = " ".join([word for word in words if word])
    liwc_scores = word_category_counter.score_text(text)

    # (feature name, LIWC category) in the order the features are emitted.
    selected = (
        ("Negative Emotion", "Negative Emotion"),
        ("Positive Emotion", "Positive Emotion"),
        ("Anxiety", "Anxiety"),
        ("Sad", "Sadness"),
        ("Angry", "Anger"),
        ("Thought", "Cognitive Processes"),
        ("Feel", "Perceptual Processes"),
    )
    feature_vectors = {}
    for feature_name, category in selected:
        feature_vectors[feature_name] = liwc_scores[category]

    is_positive = (feature_vectors["Positive Emotion"] >
                   feature_vectors["Negative Emotion"])
    feature_vectors["liwc:positive" if is_positive else "liwc:negative"] = 1

    return feature_vectors
Пример #22
0
def get_liwc_features(words):
    """
    Return a binned feature for every LIWC category scored for the words.

    :param words: iterable of word strings; joined with single spaces
        before scoring
    :return: dict mapping ``LIWC:<category>`` to ``liwc_bin(score)``
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))

    # All possible keys to the scores start on line 269
    # of the word_category_counter.py script
    return {
        "LIWC:" + category: liwc_bin(liwc_scores[category])
        for category in liwc_scores
    }
def add_liwc_features(text, feature_vector):
    """Add LIWC polarity and binned-category flags to ``feature_vector``.

    ``feature_vector`` is modified in place; nothing is returned. A
    ``liwc:positive``/``liwc:negative`` flag is set first, then one
    ``<prefix>_<bin(round(score))>`` key per listed category, each with
    value 1.
    """
    liwc_scores = word_category_counter.score_text(text)

    def flag_bucket(prefix, category):
        # One indicator per (category, bucket) pair.
        bucket = bin(round(liwc_scores[category]))
        feature_vector[prefix + "_" + str(bucket)] = 1

    # set 1 of liwc features
    negative_score = liwc_scores["Negative Emotion"]
    positive_score = liwc_scores["Positive Emotion"]
    if positive_score > negative_score:
        feature_vector["liwc:positive"] = 1
    else:
        feature_vector["liwc:negative"] = 1

    flag_bucket("liwc:anger", "Anger")
    flag_bucket("liwc:optimism", "Optimism and energy")
    flag_bucket("liwc:Swear_Words", "Swear Words")
    flag_bucket("liwc:sad", "Sadness")

    # set 2 of liwc features
    for name in ("Negations", "Family", "Friends", "Anxiety", "Feel",
                 "Positive feelings"):
        flag_bucket("liwc:" + name, name)
Пример #24
0
# Read the labelled sentences: every CSV row is expected to carry a file
# name in column 0 and the sentence text in column 1.
# NOTE(review): the 'U' flag in the mode string is rejected by open() on
# Python 3.11+ -- confirm the interpreter this script targets.
sent_input = []
with open(sent_file, 'rU') as csvfile:
    label_reader = csv.reader(csvfile)
    for row in label_reader:
        sent_input += [row]

wc.load_dictionary(wc.default_dictionary_filename())

# NOTE(review): the header names seven columns, but every data row below
# carries an eighth value (the polarity) that has no header label.
csv_op = [[
    "Filename", "Sentence", "Positive Emotion", "Negative Emotion", "Sadness",
    "Anger", "Anxiety"
]]
for pair in sent_input:
    name = pair[0]
    sentence = pair[1]
    liwc = wc.score_text(sentence)

    # Polarity: 1 when positive emotion dominates, -1 when negative emotion
    # dominates, 0 on a tie.
    if liwc["Positive Emotion"] > liwc["Negative Emotion"]:
        x = 1
    elif liwc["Positive Emotion"] < liwc["Negative Emotion"]:
        x = -1
    else:
        x = 0

    csv_op += [[
        name, sentence, liwc["Positive Emotion"], liwc["Negative Emotion"],
        liwc["Sadness"], liwc["Anger"], liwc["Anxiety"], x
    ]]

# The output sits next to the input: the last four characters of the input
# path are replaced by "_LIWC_Emotions.csv". The with-block guarantees the
# rows are flushed and the handle is closed.
with open(sent_file[:-4] + "_LIWC_Emotions.csv", "w") as out_file:
    csv.writer(out_file).writerows(csv_op)
def process_lyrics( artist_name ):
   """Score an artist's scraped lyrics with LIWC and save the results.

   Reads ``<artist_name>EXT.txt``, strips the page header and the site
   credits, scores the remaining text with ``word_category_counter`` and
   writes the normalized scores to ``SCORES/<artist_name>SCORES.txt``
   (name and value on alternating lines) and to
   ``SCORES/<artist_name>SCORES.pickle``.

   :param artist_name: prefix of the input file name and of both output
       file names
   :raises AttributeError: when the text has no "...LYRICS" header or that
       header contains no double-quoted title (the regex search then
       returns ``None``)
   """
   filename = artist_name + "EXT.txt"
   with open(filename, 'r') as f:
      raw_songs = f.read()

   # Joining first keeps the output identical whether ``print`` is the
   # Python 2 statement or the Python 3 function.
   print(" ".join(["Processing: ", filename]))

   # Taking the title and credits out of the file
   # Title: everything up to the first "LYRICS" is the page header.
   header = re.search(r'(.+?LYRICS)', raw_songs).group(1)
   quoted_title = re.search(r'(".+?")', header).group(1)

   # Whatever follows the first len(quoted_title) characters of the header
   # is removed everywhere it occurs in the text.
   header_remainder = header[len(quoted_title):]
   cleaned_songs = raw_songs.replace(header_remainder, '')

   # Credits: located in the original text, blanked out of the cleaned one.
   site_credits = re.findall(r'(Visit www\.azlyrics.*?Search)', raw_songs)
   for credit in site_credits:
      cleaned_songs = cleaned_songs.replace(credit, ' ')

   liwc_scores = word_category_counter.score_text(cleaned_songs)
   normalized_liwc_scores = word_category_counter.normalize_scores(liwc_scores)

   outfile = "SCORES/" + artist_name + "SCORES.txt"
   with open(outfile, 'w') as outf:
      for name, value in normalized_liwc_scores.items():
         outf.write("{0}\n{1}\n".format(name, value))

   # Pickle data is bytes, so the file is opened in binary mode.
   outfile = "SCORES/" + artist_name + "SCORES.pickle"
   with open(outfile, 'wb') as outf:
      pickle.dump(normalized_liwc_scores, outf)