Example #1
import fileinput
import math
import random
from collections import defaultdict, OrderedDict
#WordTokenizer, SentimentParser and the module-level constants
#(TRAINING_DATA_DIRECTORY, DEVELOPMENT_DATA_DIRECTORY, TRAIN_KEY_FILE,
#NEG_INFINITY, vocabulary_size) are project-local and defined outside this excerpt.

def probabilityOfWordInSeveralDocumentsPutTogether(word, bags, total, alpha):
    #The head of this function is truncated in the excerpt; the signature is
    #reconstructed from its call sites in minmaxSmoothing below.
    count = sum(bag.get(word, 0) for bag in bags.values())
    #Smoothing (add-alpha/Laplace): alpha on the numerator, alpha per
    #vocabulary entry on the denominator
    count += alpha
    total += vocabulary_size*alpha
    #End Smoothing
    return float(count)/total
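
#Sanity check with hypothetical numbers: with alpha = 1 and vocabulary_size = 3,
#a word seen twice among 10 tokens scores (2 + 1)/(10 + 3*1) = 3/13 rather than
#the unsmoothed 2/10, and an unseen word scores 1/13 instead of 0.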

def total(bag):
    #Return the total count of words in a bag
    return sum(bag.values())
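#e.g. total({"good": 2, "bad": 3}) == 5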

####################################

#Step 1: Get the distinct tokens in the training data and the total number of tokens, a.k.a. the bag-of-words
wordTokenizer = WordTokenizer()
words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
training_document_bag_dictionary = {}
for key, token_list in words_in_training.iteritems():
    bag_of_words = defaultdict(int)
    for word in token_list:
        bag_of_words[word]+=1
    training_document_bag_dictionary[key] = bag_of_words

training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))
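#For reference: a token list like ["good", "bad", "good"] becomes the bag
#{"good": 2, "bad": 1}; sorting by key only fixes the iteration order.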

#print training_document_bag_dictionary   

positive_training_document_bag_dictionary = {}
negative_training_document_bag_dictionary = {}
objective_training_document_bag_dictionary = {}
#######Constants###########
POSITIVE_POLARITY = "positive"
NEGATIVE_POLARITY = "negative"
OBJECTIVE_POLARITY = "neutral"
POSITIVE_POLARITY_FOR_SCORER = "POS"
NEGATIVE_POLARITY_FOR_SCORER = "NEG"
OBJECTIVE_POLARITY_FOR_SCORER = "OBJ"
GENERATED_FILES_DIRECTORY = "../../generated_files/"
tuning_parameter = 6
#####End of Constants#####


classificationSentimentDictionary = SentimentParser().wordsClassification()
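#Maps each word to the list of polarity labels of its senses; one sense is
#drawn at random per occurrence in the loop below.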
#print classificationSentimentDictionary
tokenizer = WordTokenizer()
fileDictionary = tokenizer.tokenizeDirectoryAsDictionary("dev")
fileDictionary = OrderedDict(sorted(fileDictionary.items(), key=lambda t: t[0]))
fileClassDictionary = {}
for key, value_as_list in fileDictionary.iteritems():
    positive = 0
    negative = 0
    for word in value_as_list:
        if word in classificationSentimentDictionary:
            word_sense_list = classificationSentimentDictionary[word]
            word_sense = random.choice(word_sense_list)
            if word_sense == POSITIVE_POLARITY:
                positive+=1
            elif word_sense == NEGATIVE_POLARITY:
                negative+=1
    abstotal = abs(positive - negative)
    if abstotal <= tuning_parameter:
        #Branch body truncated in the excerpt; a small gap between positive and
        #negative hits plausibly marks the document as objective (assumed completion):
        fileClassDictionary[key] = OBJECTIVE_POLARITY_FOR_SCORER

def minmaxSmoothing(power):
    alpha = math.pow(10, power)
    #Step 1: Get the distinct tokens in the training data and the total number of tokens, a.k.a. the bag-of-words
    wordTokenizer = WordTokenizer()
    words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
    training_document_bag_dictionary = {}
    for key, token_list in words_in_training.iteritems():
        bag_of_words = defaultdict(int)
        for word in token_list:
            bag_of_words[word]+=1
        training_document_bag_dictionary[key] = bag_of_words
    
    training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))
    
    #print training_document_bag_dictionary   
    
    positive_training_document_bag_dictionary = {}
    negative_training_document_bag_dictionary = {}
    objective_training_document_bag_dictionary = {}
    
    total_positive = 0
    total_negative = 0
    total_objective = 0
    
    for line in fileinput.input([TRAIN_KEY_FILE]):
        temp_line = line.split(" ")
        fileName = temp_line[0]
        sense = temp_line[1].strip()
        fileName = "../../"+fileName #Make it into a suitable format
        if sense == POSITIVE_POLARITY_FOR_SCORER:   #document is a positive one 
            #Add the fileName -> bag-of-words(fileName) to a dict
            positive_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_positive += total(training_document_bag_dictionary[fileName])
        elif sense == NEGATIVE_POLARITY_FOR_SCORER:
            negative_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_negative += total(training_document_bag_dictionary[fileName])
        elif sense == OBJECTIVE_POLARITY_FOR_SCORER:
            objective_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_objective += total(training_document_bag_dictionary[fileName])
        
    training_total_documents = len(training_document_bag_dictionary)
    training_total_positive_documents = len(positive_training_document_bag_dictionary)
    training_total_negative_documents = len(negative_training_document_bag_dictionary)
    training_total_objective_documents = len(objective_training_document_bag_dictionary)
    
    probability_positive = float(training_total_positive_documents)/ training_total_documents
    probability_negative = float(training_total_negative_documents)/ training_total_documents
    probability_objective = float(training_total_objective_documents)/ training_total_documents
    
    log_positive = math.log(probability_positive)
    log_negative = math.log(probability_negative)
    log_objective = math.log(probability_objective)
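    #Working in log space keeps the product of many small per-word probabilities
    #from underflowing: log P(C|d) is proportional to log P(C) + sum over w of log P(w|C).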
    
    words_in_development = wordTokenizer.tokenizeDirectoryAsDictionary(DEVELOPMENT_DATA_DIRECTORY)
    words_in_development = OrderedDict(sorted(words_in_development.items(), key=lambda t: t[0]))
    
    total_count = total_positive + total_negative + total_objective
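    #Note: this pooled count over all three classes is passed as the smoothing
    #denominator for every class below.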
    
    responseFile = GENERATED_FILES_DIRECTORY+"smoothing_response_MIN_MAX"+str(power)
    f = open(responseFile, "w")
    
    for key, token_list in words_in_development.iteritems():
        sentiment = ""
        p = 0
        n = 0
        o = 0
        #Step 2: P(POS|Document) P(NEG|Document} P(OBJ|Document)
        for word in token_list:
            #Step 2.1
            #P(k| POS) P(k| NEG) P(k| OBJ) for each word in this document
            
            probability_word_positive_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, positive_training_document_bag_dictionary, total_count, alpha)
            log_probability_positive = math.log(probability_word_positive_documents) if probability_word_positive_documents > 0 else NEG_INFINITY
            p+=log_probability_positive
            
            probability_word_negative_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, negative_training_document_bag_dictionary, total_count, alpha)
            log_probability_negative = math.log(probability_word_negative_documents) if probability_word_negative_documents > 0 else NEG_INFINITY
            n+=log_probability_negative
            
            probability_word_objective_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, objective_training_document_bag_dictionary, total_count, alpha)
            log_probability_objective = math.log(probability_word_objective_documents) if probability_word_objective_documents > 0 else NEG_INFINITY
            o+=log_probability_objective
        #Step 2.2: P(C|Document) = P(C) + all the P(k|C) --> Here in log form
        p+=log_positive
        n+=log_negative
        o+=log_objective
        
        
        if p > n:
            if p > o:
                sentiment = POSITIVE_POLARITY_FOR_SCORER
                theta = p
            else:
                sentiment = OBJECTIVE_POLARITY_FOR_SCORER
                theta = o
        else:
            if n > o:
                sentiment = NEGATIVE_POLARITY_FOR_SCORER
                theta = n
            else:
                sentiment = OBJECTIVE_POLARITY_FOR_SCORER
                theta = o
        
        print "Document Name: "+key+" is classified as: "+sentiment
        f.write(sentiment+"\t"+str(theta)+"\n")
    f.close()
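
#A minimal driver (assumed usage, not part of the original excerpt): sweep the
#exponent so that alpha = 10**power spans several orders of magnitude, writing
#one response file per setting for the scorer to compare.
if __name__ == "__main__":
    for power in range(-3, 2): #alpha in 0.001 ... 10
        minmaxSmoothing(power)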
Example #4
import math
import random
from collections import defaultdict, OrderedDict
#WordTokenizer and SentimentParser are project-local helpers, as in Example #1.

def files_having_word(files_words_bag, word):
    #Return (document frequency, L2 norm of the per-document counts, total count)
    #of `word` across all bags of words
    count = 0
    length_square = 0.0
    length = 0
    for this_bag in files_words_bag.values():
        if word in this_bag:
            count += 1
            length_square += math.pow(this_bag[word], 2)
            length += this_bag[word]
    return count, math.sqrt(length_square), length
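
#Illustrative call on hypothetical bags: "good" appears in two of the three
#documents, with per-document counts 3 and 4.
sample_bags = {"d1": {"good": 3}, "d2": {"bad": 1}, "d3": {"good": 4}}
df, l2_norm, tf = files_having_word(sample_bags, "good")
#df == 2, l2_norm == math.sqrt(3**2 + 4**2) == 5.0, tf == 7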
####################################

#Step 1: Get the distinct tokens in the training data and the total number of tokens, a.k.a. the bag-of-words
wordTokenizer = WordTokenizer()
words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
training_document_bag_dictionary = {}
distinct_tokens = defaultdict(int)
for key, token_list in words_in_training.iteritems():
    bag_of_words = defaultdict(int)
    for word in token_list:
        bag_of_words[word]+=1
        distinct_tokens[word]+=1
    training_document_bag_dictionary[key] = bag_of_words

training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))

#print training_document_bag_dictionary   

positive_training_document_bag_dictionary = {}
negative_training_document_bag_dictionary = {}
#######Constants###########
POSITIVE_POLARITY = "positive"
NEGATIVE_POLARITY = "negative"
OBJECTIVE_POLARITY = "neutral"
POSITIVE_POLARITY_FOR_SCORER = "POS"
NEGATIVE_POLARITY_FOR_SCORER = "NEG"
OBJECTIVE_POLARITY_FOR_SCORER = "OBJ"
GENERATED_FILES_DIRECTORY = "../../generated_files/"
tuning_parameter = 10
#####End of Constants#####


classificationSentimentDictionary = SentimentParser().wordsClassification()
#print classificationSentimentDictionary
tokenizer = WordTokenizer()
fileDictionary = tokenizer.tokenizeDirectoryAsDictionary("train")
fileDictionary = OrderedDict(sorted(fileDictionary.items(), key=lambda t: t[0]))
fileClassDictionary = {}
for key, value_as_list in fileDictionary.iteritems():
    positive = 0
    negative = 0
    for word in value_as_list:
        if word in classificationSentimentDictionary:
            word_sense_list = classificationSentimentDictionary[word]
            word_sense = random.choice(word_sense_list)
            if word_sense == POSITIVE_POLARITY:
                positive+=1
            elif word_sense == NEGATIVE_POLARITY:
                negative+=1
    abstotal = abs(positive - negative)
    if abstotal <= tuning_parameter:
        #The excerpt ends mid-branch; a small gap between positive and negative
        #hits plausibly marks the document as objective (assumed completion):
        fileClassDictionary[key] = OBJECTIVE_POLARITY_FOR_SCORER