Пример #1
0
def frequency_selector():
    """Write the most frequent class-specific words of each sentiment to CSV.

    Steps:
      1. Call ``dumpTwittes`` once per label to produce per-sentiment dump
         files from ``sentiment.csv``.
      2. Build a word -> count mapping for each dump via ``count_freq``.
      3. For each sentiment keep only the words that are not also
         characteristic of another sentiment (see ``_distinctive_words``),
         ranked by descending count, ties broken alphabetically, top 100.
      4. Write each ranking to its own ``*_features.csv`` file.

    Takes no arguments and returns ``None``; all results are written to
    files in the current working directory.
    """
    # NOTE(review): 'irrelevant' tweets go to the same dump file as
    # 'neutral'. That only merges the two classes if dumpTwittes appends
    # rather than truncates its output file -- TODO confirm.
    dump_jobs = (
        ('positive_twittes_dump.csv', 'positive'),
        ('negative_twittes_dump.csv', 'negative'),
        ('neutral_twittes_dump.csv', 'neutral'),
        ('neutral_twittes_dump.csv', 'irrelevant'),
    )
    for dump_path, label in dump_jobs:
        dumpTwittes('sentiment.csv', dump_path, label)

    # count_freq's result is fed to dict(), so it is presumably a sequence of
    # (word, count) pairs; the meaning of 1000 is defined by count_freq.
    positive_words = dict(count_freq('positive_twittes_dump.csv', 1000))
    negative_words = dict(count_freq('negative_twittes_dump.csv', 1000))
    neutral_words = dict(count_freq('neutral_twittes_dump.csv', 1000))

    popular_control = 50  # reject a word seen more than this often elsewhere
    threshold = 100       # own count must be >= threshold * count elsewhere
    limit = 100           # number of features kept per sentiment

    positive_features = _distinctive_words(
        positive_words, (negative_words, neutral_words),
        popular_control, threshold, limit)
    negative_features = _distinctive_words(
        negative_words, (positive_words, neutral_words),
        popular_control, threshold, limit)
    neutral_features = _distinctive_words(
        neutral_words, (negative_words, positive_words),
        popular_control, threshold, limit)

    # File names and header labels (including the 'postive' spelling) are
    # kept exactly as before because downstream readers may depend on them.
    _write_feature_csv('postive_features.csv', '***postive***',
                       positive_features)
    _write_feature_csv('negative_features.csv', '***negative***',
                       negative_features)
    _write_feature_csv('neutral_features.csv', '***neutral***',
                       neutral_features)


def _distinctive_words(own, others, popular_control, threshold, limit):
    """Return the top ``limit`` (word, count) pairs of ``own`` not shared with ``others``.

    A word is dropped when it occurs in any mapping of ``others`` and either
    its count there exceeds ``popular_control`` or its count in ``own`` is
    less than ``threshold`` times its count there. Survivors are ordered by
    descending count, then by word.
    """
    def is_shared(word, count):
        return any(
            word in other
            and (other[word] > popular_control
                 or count < other[word] * threshold)
            for other in others)

    kept = [(word, count) for word, count in own.items()
            if not is_shared(word, count)]
    # Index-based key instead of a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept[:limit]


def _write_feature_csv(path, label, features):
    """Write a ``[label, '***MI***']`` header and one row per (word, count) pair."""
    # The context manager guarantees the file is flushed and closed; the
    # previous code never closed the handles it passed to csv.writer.
    with open(path, 'w') as handle:
        writer = csv.writer(handle, delimiter=',', quotechar='"')
        # NOTE(review): the second column is headed '***MI***' although the
        # values written here are raw frequencies.
        writer.writerow([label, '***MI***'])
        for word, freq in features:
            writer.writerow([word, freq])
def frequency_selector():
    """Write the most frequent class-specific words of each sentiment to CSV.

    Steps:
      1. Call ``dumpTwittes`` once per label to produce per-sentiment dump
         files from ``sentiment.csv``.
      2. Build a word -> count mapping for each dump via ``count_freq``.
      3. For each sentiment keep only the words that are not also
         characteristic of another sentiment (see ``_distinctive_words``),
         ranked by descending count, ties broken alphabetically, top 100.
      4. Write each ranking to its own ``*_features.csv`` file.

    Takes no arguments and returns ``None``; all results are written to
    files in the current working directory.
    """
    # NOTE(review): 'irrelevant' tweets go to the same dump file as
    # 'neutral'. That only merges the two classes if dumpTwittes appends
    # rather than truncates its output file -- TODO confirm.
    dump_jobs = (
        ('positive_twittes_dump.csv', 'positive'),
        ('negative_twittes_dump.csv', 'negative'),
        ('neutral_twittes_dump.csv', 'neutral'),
        ('neutral_twittes_dump.csv', 'irrelevant'),
    )
    for dump_path, label in dump_jobs:
        dumpTwittes('sentiment.csv', dump_path, label)

    # count_freq's result is fed to dict(), so it is presumably a sequence of
    # (word, count) pairs; the meaning of 1000 is defined by count_freq.
    positive_words = dict(count_freq('positive_twittes_dump.csv', 1000))
    negative_words = dict(count_freq('negative_twittes_dump.csv', 1000))
    neutral_words = dict(count_freq('neutral_twittes_dump.csv', 1000))

    popular_control = 50  # reject a word seen more than this often elsewhere
    threshold = 100       # own count must be >= threshold * count elsewhere
    limit = 100           # number of features kept per sentiment

    positive_features = _distinctive_words(
        positive_words, (negative_words, neutral_words),
        popular_control, threshold, limit)
    negative_features = _distinctive_words(
        negative_words, (positive_words, neutral_words),
        popular_control, threshold, limit)
    neutral_features = _distinctive_words(
        neutral_words, (negative_words, positive_words),
        popular_control, threshold, limit)

    # File names and header labels (including the 'postive' spelling) are
    # kept exactly as before because downstream readers may depend on them.
    _write_feature_csv('postive_features.csv', '***postive***',
                       positive_features)
    _write_feature_csv('negative_features.csv', '***negative***',
                       negative_features)
    _write_feature_csv('neutral_features.csv', '***neutral***',
                       neutral_features)


def _distinctive_words(own, others, popular_control, threshold, limit):
    """Return the top ``limit`` (word, count) pairs of ``own`` not shared with ``others``.

    A word is dropped when it occurs in any mapping of ``others`` and either
    its count there exceeds ``popular_control`` or its count in ``own`` is
    less than ``threshold`` times its count there. Survivors are ordered by
    descending count, then by word.
    """
    def is_shared(word, count):
        return any(
            word in other
            and (other[word] > popular_control
                 or count < other[word] * threshold)
            for other in others)

    kept = [(word, count) for word, count in own.items()
            if not is_shared(word, count)]
    # Index-based key instead of a tuple-parameter lambda, which is a
    # syntax error on Python 3.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept[:limit]


def _write_feature_csv(path, label, features):
    """Write a ``[label, '***MI***']`` header and one row per (word, count) pair."""
    # The context manager guarantees the file is flushed and closed; the
    # previous code never closed the handles it passed to csv.writer.
    with open(path, 'w') as handle:
        writer = csv.writer(handle, delimiter=',', quotechar='"')
        # NOTE(review): the second column is headed '***MI***' although the
        # values written here are raw frequencies.
        writer.writerow([label, '***MI***'])
        for word, freq in features:
            writer.writerow([word, freq])
Пример #3
0
def feature_selection(category, frequency, stat):
    """Rank the frequent words of ``sentiment.csv`` by their score for ``category``.

    Args:
        category: passed through unchanged to ``mi_oneword``.
        frequency: passed through to ``count_freq`` as its second argument.
        stat: passed through unchanged to ``mi_oneword`` (the commented-out
            code this replaces suggests it selects between an MI and a CHI
            statistic -- confirm against ``mi_oneword``).

    Returns:
        A list of ``(word, score)`` tuples for the words whose score is
        strictly positive, sorted by descending score and then by word,
        with the last (lowest-ranked) entry removed.

    Side effects:
        Prints the raw ``count_freq`` result and one ``word : score`` line
        per retained word to standard output.
    """
    top_words = count_freq('sentiment.csv', frequency)
    # Single-argument print calls produce identical output whether print is
    # a statement (Python 2) or a function (Python 3).
    print('\n\n\n')
    print(top_words)
    print('\n\n\n')

    scores = {}
    # The count is unused; naming it `_count` avoids overwriting the
    # `frequency` parameter as the previous loop variable did.
    for word, _count in top_words:
        score = mi_oneword(word, category, stat)
        if score > 0:
            scores[word] = score
            print('%s : %s' % (word, score))

    ranked = sorted(scores.items(), key=lambda pair: (-pair[1], pair[0]))
    # NOTE(review): dropping the final entry reproduces the original
    # `[:len(mi_result) - 1]` slice. It discards the lowest-ranked feature
    # (and returns [] when only one word qualifies), which looks like an
    # off-by-one; it is kept so callers see unchanged results.
    return ranked[:-1]
def feature_selection(category, frequency, stat):
    """Rank the frequent words of ``sentiment.csv`` by their score for ``category``.

    Args:
        category: passed through unchanged to ``mi_oneword``.
        frequency: passed through to ``count_freq.count_freq`` as its second
            argument.
        stat: passed through unchanged to ``mi_oneword`` (the commented-out
            code this replaces suggests it selects between an MI and a CHI
            statistic -- confirm against ``mi_oneword``).

    Returns:
        A list of ``(word, score)`` tuples for the words whose score is
        strictly positive, sorted by descending score and then by word,
        with the last (lowest-ranked) entry removed.

    Side effects:
        Prints the raw ``count_freq.count_freq`` result and one
        ``word : score`` line per retained word to standard output.
    """
    top_words = count_freq.count_freq('sentiment.csv', frequency)
    # Single-argument print calls produce identical output whether print is
    # a statement (Python 2) or a function (Python 3).
    print('\n\n\n')
    print(top_words)
    print('\n\n\n')

    scores = {}
    # The count is unused; naming it `_count` avoids overwriting the
    # `frequency` parameter as the previous loop variable did.
    for word, _count in top_words:
        score = mi_oneword(word, category, stat)
        if score > 0:
            scores[word] = score
            print('%s : %s' % (word, score))

    ranked = sorted(scores.items(), key=lambda pair: (-pair[1], pair[0]))
    # NOTE(review): dropping the final entry reproduces the original
    # `[:len(mi_result) - 1]` slice. It discards the lowest-ranked feature
    # (and returns [] when only one word qualifies), which looks like an
    # off-by-one; it is kept so callers see unchanged results.
    return ranked[:-1]