from math import log
import operator

# f2c(corpus, file_name) and get_list_tokens_nltk(corpus, file_name) are project
# helpers defined elsewhere: f2c maps a file to its (single) category, and
# get_list_tokens_nltk tokenizes the raw text of a file.

def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    # Keep only documents that belong to exactly one category.
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f.startswith('test/')]
    trainset = [f for f in clean_files if f.startswith('training/')]
    # Keep only categories with more than 20 documents in both splits.
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
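# Minimal usage sketch (assumes the two module-level dictionaries referenced by
# the global statements above are initialized first):
#
#   categories_file_name_dict = {}
#   cat_num_docs = {}
#   trainset, testset = get_testset_trainset_nltk_reuters()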
def mutual_information(trainset, corpus, num_top_words=500):
    ##3) Mutual Information - feature selection: keep as features only the words
    ## with the highest mutual information for a category, selecting the top
    ## num_top_words words per category.
    ##A) Create a dictionary keyed by word; each value is itself a dictionary
    ## mapping a category to the number of documents in that category in which
    ## the word occurs.

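    ## The score computed below is one term of pointwise mutual information:
    ##     MI(w, cat) = (n11 / n) * log2((n * n11) / (n1x * n1x))
    ## where n = total training documents, n11 = documents of the category that
    ## contain w, and n1x = documents in any category that contain w. (The
    ## textbook estimate would use n1x * n_cat in the denominator, with n_cat
    ## the document count of the category; this code uses n1x twice.)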
    word_cat_num_doc_dict = {}
    n = len(trainset)  # total number of training documents

    ##B) Loop through the training set to get the entire text of each file.
    ##C) Parse the string to get individual words.

    for file_name in trainset:
        list_words = get_list_tokens_nltk(corpus, file_name)
        cat = f2c(corpus, file_name)

        ##D) Update the dictionary, counting each distinct word once per document.
        for w in set(list_words):
            word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
            word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0) + 1
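    ## Illustrative shape of the structure built above (hypothetical values):
    ##   word_cat_num_doc_dict == {'oil': {'crude': 120, 'ship': 3}, ...}
    ## i.e. 'oil' occurs in 120 training documents of category 'crude'.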


    ##E) Build word_features: key = category, value = a list of (word,
    ## mutual information) tuples; only these words are used as features.
    ## Also build word_list, a flat list of all selected feature words.
    word_features = {}
    word_list = []
    for w in word_cat_num_doc_dict.keys():
        dic = word_cat_num_doc_dict[w]
        # n1x: number of training documents, across all categories, containing w
        n1x = sum(dic.values())
        for cat in dic.keys():
            word_features[cat] = word_features.get(cat, [])
            n11 = dic[cat]
            mi = (n11 / n) * log((n * n11) / (n1x * n1x), 2)
            # Keep only the num_top_words highest-scoring (word, mi) pairs,
            # maintaining the list sorted in descending order once it is full.
            if len(word_features[cat]) < num_top_words:
                word_features[cat].append((w, mi))
                if len(word_features[cat]) == num_top_words:
                    word_features[cat].sort(key=operator.itemgetter(1), reverse=True)
            else:
                if word_features[cat][num_top_words - 1][1] < mi:
                    word_features[cat][num_top_words - 1] = (w, mi)
                    word_features[cat].sort(key=operator.itemgetter(1), reverse=True)
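    ## An equivalent, simpler top-k selection could use the standard library,
    ## e.g. heapq.nlargest(num_top_words, candidates, key=operator.itemgetter(1));
    ## the incremental sort above is kept as written.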


    for cat in word_features.keys():
        word_features[cat] = dict(word_features[cat])
        for w in word_features[cat]:
            word_list.append(w)

    return [word_features, set(word_list)]
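# Hedged usage sketch ('reuters' is the corpus tag these helpers expect):
#
#   trainset, testset = get_testset_trainset_nltk_reuters()
#   word_features, word_list = mutual_information(trainset, 'reuters')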
def gini(trainset, corpus, num_top_words=500):
    # The conditional probability of a word given a category is estimated with
    # Laplace smoothing, in a way very similar to multinomial Naive Bayes.
    cat_word_dict = {}
    cat_word_count_dict = {}
    word_list = []
    word_features = {}
    for file_name in trainset:
        list_words = get_list_tokens_nltk(corpus, file_name)
        cat = f2c(corpus, file_name)

        cat_word_dict[cat] = cat_word_dict.get(cat, {})
        cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
        cat_word_count_dict[cat] += len(list_words)

        for w in list_words:
            cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + 1

    # Vocabulary size used for smoothing: the sum of the per-category vocabulary
    # sizes (a word occurring in several categories is counted once per category).
    vocab_length = 0
    for dic in cat_word_dict.values():
        vocab_length += len(dic)


    for cat in cat_word_dict:
        count_cat = cat_word_count_dict[cat]
        word_features[cat] = word_features.get(cat, [])
        for w in cat_word_dict[cat]:
            # Laplace-smoothed P(w | cat)
            cond_prob = (cat_word_dict[cat][w] + 1) / (count_cat + vocab_length)
            # Sum of the smoothed conditional probabilities over all categories.
            sum_cp = cond_prob
            for cate in cat_word_dict:
                if cate != cat:
                    sum_cp += (cat_word_dict[cate].get(w, 0) + 1) / (cat_word_count_dict[cate] + vocab_length)

            gini_coef = cond_prob / sum_cp
            # Keep only the num_top_words highest-scoring (word, score) pairs.
            if len(word_features[cat]) < num_top_words:
                word_features[cat].append((w, gini_coef))
                if len(word_features[cat]) == num_top_words:
                    word_features[cat].sort(key=operator.itemgetter(1), reverse=True)
            else:
                if word_features[cat][num_top_words - 1][1] < gini_coef:
                    word_features[cat][num_top_words - 1] = (w, gini_coef)
                    word_features[cat].sort(key=operator.itemgetter(1), reverse=True)

    for cat in word_features.keys():
        word_features[cat] = dict(word_features[cat])
        for w in word_features[cat]:
            word_list.append(w)

    return [word_features, set(word_list)]
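# Note on the score: despite the name, gini() ranks words by the normalized
# smoothed conditional probability P(w|cat) / sum over c of P(w|c), not by the
# Gini coefficient. Usage mirrors mutual_information:
#
#   word_features, word_list = gini(trainset, 'reuters')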
# Fragment of the classifier's training routine: choose a feature selector, then
# build per-category word counts over the training set. The if-branch condition
# is not preserved in this excerpt; 'feature_selector' below is a hypothetical
# name standing in for whatever flag the full code tests (presumably selecting
# the gini-based features, with mutual information as the fallback).
if feature_selector == 'gini':
    li = Feature_Selector.gini(trainset, corpus)
else:
    li = Feature_Selector.mutual_information(trainset, corpus)
word_features = li[0]
word_list = li[1]

##4) Create a) a dictionary with a category as the key and a dictionary of
##   word occurrences as the value, and b) a dictionary with a category as the
##   key and the number of words in it as the value.
cat_word_dict = {}
cat_word_count_dict = {}

##5) Loop through the training set to get the entire text of each file.
##6) Parse the string to get individual words.
for file_name in trainset:
    list_words = get_list_tokens_nltk(corpus, file_name)
    cat = f2c(corpus, file_name)
    # Keep only the words selected as features for this category.
    list_words = [w for w in list_words if w in word_features[cat]]
    # Possible improvement: stemming.

    ##7) If the category is not yet in the dictionaries, create an empty word
    ##   dictionary and a zero word count; then insert the document's words and
    ##   update the word count.
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)

    ##Update the dictionary - 2 possible ways:
    ##A) loop over the set of words and update the dictionary with the log value:
    ##   complexity n(set) * n(count operation) = O(n^2)
    ##B) loop over the list and update the count for each occurrence; at the end,
    ##   loop over the set and replace each count with its log value
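    ## A hedged completion of option B (the excerpt is cut off here); the
    ## counting step mirrors the loop inside gini(). The final count-to-log
    ## conversion is assumed to happen in the Naive Bayes scorer that follows
    ## in the full code.
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + 1
    cat_word_count_dict[cat] += len(list_words)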