def _select_distinctive(own, others, popular_control, threshold, limit=100):
    """Return the top `limit` (word, count) pairs of `own` not shared with `others`.

    A word is rejected when it also appears in any mapping of `others` and
    either its count there exceeds `popular_control`, or its count in `own`
    is less than `threshold` times its count there.

    The result is ordered by descending count, ties broken by word.
    """
    kept = {}
    for word, count in own.items():
        rejected = any(
            word in other
            and (other[word] > popular_control
                 or count < other[word] * threshold)
            for other in others
        )
        if not rejected:
            kept[word] = count
    ranked = sorted(kept.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:limit]


def _write_features(path, label, features):
    """Write a header row plus one (word, count) row per feature to `path`."""
    # The context manager guarantees the file is flushed and closed; the
    # previous code handed an anonymous open() to csv.writer and never
    # closed it.
    with open(path, 'w') as handle:
        writer = csv.writer(handle, delimiter=',', quotechar='"')
        # The second column is labelled MI although the values written are
        # the counts taken from count_freq; label kept for compatibility.
        writer.writerow(['***%s***' % label, '***MI***'])
        for word, freq in features:
            writer.writerow([word, freq])


def frequency_selector():
    """Select class-distinctive words by frequency and write them to CSV.

    Dumps the tweets of each sentiment class from 'sentiment.csv', counts
    word frequencies per class with count_freq, keeps for each class the
    words that are absent from (or strongly dominant over) the other two
    classes, and writes the top 100 per class to
    'postive_features.csv', 'negative_features.csv' and
    'neutral_features.csv'.
    """
    dumpTwittes('sentiment.csv', 'positive_twittes_dump.csv', 'positive')
    dumpTwittes('sentiment.csv', 'negative_twittes_dump.csv', 'negative')
    dumpTwittes('sentiment.csv', 'neutral_twittes_dump.csv', 'neutral')
    # NOTE(review): 'irrelevant' is dumped into the same file as 'neutral';
    # presumably dumpTwittes appends so both classes end up merged -- confirm.
    dumpTwittes('sentiment.csv', 'neutral_twittes_dump.csv', 'irrelevant')

    positive_words = dict(count_freq('positive_twittes_dump.csv', 1000))
    negative_words = dict(count_freq('negative_twittes_dump.csv', 1000))
    neutral_words = dict(count_freq('neutral_twittes_dump.csv', 1000))

    popular_control = 50
    threshold = 100

    positive_features = _select_distinctive(
        positive_words, (negative_words, neutral_words),
        popular_control, threshold)
    negative_features = _select_distinctive(
        negative_words, (positive_words, neutral_words),
        popular_control, threshold)
    neutral_features = _select_distinctive(
        neutral_words, (negative_words, positive_words),
        popular_control, threshold)

    # The 'postive' spelling is part of the existing output contract (file
    # name and header); it is kept so downstream readers keep working.
    _write_features('postive_features.csv', 'postive', positive_features)
    _write_features('negative_features.csv', 'negative', negative_features)
    _write_features('neutral_features.csv', 'neutral', neutral_features)
def _select_distinctive(own, others, popular_control, threshold, limit=100):
    """Return the top `limit` (word, count) pairs of `own` not shared with `others`.

    A word is rejected when it also appears in any mapping of `others` and
    either its count there exceeds `popular_control`, or its count in `own`
    is less than `threshold` times its count there.

    The result is ordered by descending count, ties broken by word.
    """
    kept = {}
    for word, count in own.items():
        rejected = any(
            word in other
            and (other[word] > popular_control
                 or count < other[word] * threshold)
            for other in others
        )
        if not rejected:
            kept[word] = count
    ranked = sorted(kept.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:limit]


def _write_features(path, label, features):
    """Write a header row plus one (word, count) row per feature to `path`."""
    # The context manager guarantees the file is flushed and closed; the
    # previous code handed an anonymous open() to csv.writer and never
    # closed it.
    with open(path, 'w') as handle:
        writer = csv.writer(handle, delimiter=',', quotechar='"')
        # The second column is labelled MI although the values written are
        # the counts taken from count_freq; label kept for compatibility.
        writer.writerow(['***%s***' % label, '***MI***'])
        for word, freq in features:
            writer.writerow([word, freq])


def frequency_selector():
    """Select class-distinctive words by frequency and write them to CSV.

    Dumps the tweets of each sentiment class from 'sentiment.csv', counts
    word frequencies per class with count_freq, keeps for each class the
    words that are absent from (or strongly dominant over) the other two
    classes, and writes the top 100 per class to
    'postive_features.csv', 'negative_features.csv' and
    'neutral_features.csv'.
    """
    dumpTwittes('sentiment.csv', 'positive_twittes_dump.csv', 'positive')
    dumpTwittes('sentiment.csv', 'negative_twittes_dump.csv', 'negative')
    dumpTwittes('sentiment.csv', 'neutral_twittes_dump.csv', 'neutral')
    # NOTE(review): 'irrelevant' is dumped into the same file as 'neutral';
    # presumably dumpTwittes appends so both classes end up merged -- confirm.
    dumpTwittes('sentiment.csv', 'neutral_twittes_dump.csv', 'irrelevant')

    positive_words = dict(count_freq('positive_twittes_dump.csv', 1000))
    negative_words = dict(count_freq('negative_twittes_dump.csv', 1000))
    neutral_words = dict(count_freq('neutral_twittes_dump.csv', 1000))

    popular_control = 50
    threshold = 100

    positive_features = _select_distinctive(
        positive_words, (negative_words, neutral_words),
        popular_control, threshold)
    negative_features = _select_distinctive(
        negative_words, (positive_words, neutral_words),
        popular_control, threshold)
    neutral_features = _select_distinctive(
        neutral_words, (negative_words, positive_words),
        popular_control, threshold)

    # The 'postive' spelling is part of the existing output contract (file
    # name and header); it is kept so downstream readers keep working.
    _write_features('postive_features.csv', 'postive', positive_features)
    _write_features('negative_features.csv', 'negative', negative_features)
    _write_features('neutral_features.csv', 'neutral', neutral_features)
def feature_selection(category, frequency, stat):
    """Rank candidate words for `category` by their mi_oneword score.

    Candidate words come from count_freq('sentiment.csv', frequency). Each
    is scored with mi_oneword(word, category, stat) and only strictly
    positive scores are kept.

    Args:
        category: passed through to mi_oneword.
        frequency: passed through to count_freq as its second argument.
        stat: passed through to mi_oneword (presumably selects the
            statistic, e.g. MI or CHI -- confirm against mi_oneword).

    Returns:
        A list of (word, score) tuples ordered by descending score, ties
        broken by word, with the last ranked entry omitted (see note below).
    """
    top_words = count_freq('sentiment.csv', frequency)
    print('\n\n\n')
    print(top_words)
    print('\n\n\n')

    # A plain dict is used on purpose: the previous defaultdict(int) gained
    # a zero-valued entry for every rejected word as a side effect of the
    # debug print, which leaked non-features into the returned ranking.
    scores = {}
    for word, _count in top_words:
        score = mi_oneword(word, category, stat)
        # An earlier variant filtered specific sentinel values per
        # statistic; the current rule simply keeps positive scores.
        if score > 0:
            scores[word] = score
        # Read-only lookup so printing never mutates the score table.
        print('%s : %s' % (word, scores.get(word, 0)))

    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    # NOTE(review): the slice drops the lowest-ranked entry, and yields an
    # empty list when only one word scored. Kept as-is for backward
    # compatibility -- confirm whether this is intentional.
    return ranked[:len(ranked) - 1]
def feature_selection(category, frequency, stat):
    """Rank candidate words for `category` by their mi_oneword score.

    Candidate words come from count_freq.count_freq('sentiment.csv',
    frequency). Each is scored with mi_oneword(word, category, stat) and
    only strictly positive scores are kept.

    Args:
        category: passed through to mi_oneword.
        frequency: passed through to count_freq.count_freq as its second
            argument.
        stat: passed through to mi_oneword (presumably selects the
            statistic, e.g. MI or CHI -- confirm against mi_oneword).

    Returns:
        A list of (word, score) tuples ordered by descending score, ties
        broken by word, with the last ranked entry omitted (see note below).
    """
    top_words = count_freq.count_freq('sentiment.csv', frequency)
    print('\n\n\n')
    print(top_words)
    print('\n\n\n')

    # A plain dict is used on purpose: the previous defaultdict(int) gained
    # a zero-valued entry for every rejected word as a side effect of the
    # debug print, which leaked non-features into the returned ranking.
    scores = {}
    for word, _count in top_words:
        score = mi_oneword(word, category, stat)
        # An earlier variant filtered specific sentinel values per
        # statistic; the current rule simply keeps positive scores.
        if score > 0:
            scores[word] = score
        # Read-only lookup so printing never mutates the score table.
        print('%s : %s' % (word, scores.get(word, 0)))

    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    # NOTE(review): the slice drops the lowest-ranked entry, and yields an
    # empty list when only one word scored. Kept as-is for backward
    # compatibility -- confirm whether this is intentional.
    return ranked[:len(ranked) - 1]