# NOTE(review): the statements below have lost their line breaks and indentation. The loop
# header that supplies `i` for the first statements and the body of the trailing `elif` are
# outside this span, so the original nesting cannot be recovered from here. Python 2 syntax.
# Visible steps, in order:
#   1. classdoc_ids = row indices where train_labels[:, i] is non-zero. If there are none,
#      term_freq[i] becomes a one-entry dict mapping the first key of term_freq[i-1] to 0 and
#      the iteration is skipped; otherwise term_freq[i] = get_TF(...) for those rows.
#      NOTE(review): if this can run with i == 0 it reads term_freq[-1] -- confirm intended.
#   2. Every (key, value) pair of term_freq[i] is passed to add2dict together with all_terms,
#      and totterms grows by the sum of term_freq[i]'s values.
#   3. all_terms_prob = each all_terms value divided by totterms.
#      NOTE(review): Python 2 `/` truncates when both operands are ints -- confirm the values
#      are floats or that `from __future__ import division` is in effect.
#   4. For i in range(num_labels): get_TF over the non-zero rows of
#      train_labels_complement[:, i], stored at term_freq[num_labels + i].
#   5. When weight == "tf" or cor_type == "P": freqToProbability is called for each label
#      (term_prob[i]) and for its complement (term_prob[num_labels + i]) with all_terms_prob
#      and lamda; vocab_choice is bound to term_prob.
#   6. all_terms_list keeps all_terms.keys(); all_terms, all_terms_prob and
#      vectorised_train_documents_tf are then rebound to empty containers.
#   7. An elapsed-time message is printed via print_time(start_time); start_time is reset.
#   8. When cor_type == "J": cooccurences_by_class =
#      cooccurence_main.get_cooccurences_BR(train_labels, train_docs, P_AandB=False).
classdoc_ids = numpy.nonzero(train_labels[:, i])[0].tolist() if len(classdoc_ids) == 0: term_freq[i] = {term_freq[i-1].keys()[0]: 0} continue term_freq[i] = get_TF(vectorizer_tf, vectorised_train_documents_tf, classdoc_ids) map(lambda (k, v): add2dict(k, v, all_terms), term_freq[i].items()) totterms += sum(term_freq[i].values()) all_terms_prob = {k: v/totterms for k, v in all_terms.items()} #Divide features by class-complements for i in range(num_labels): classdoc_ids = numpy.nonzero(train_labels_complement[:, i])[0].tolist() term_freq[num_labels + i] = get_TF(vectorizer_tf, vectorised_train_documents_tf, classdoc_ids) #Convert to Probability & Perform Jelinek-Mercer Smoothing if weight == "tf" or cor_type == "P": for i in range(num_labels): term_prob[i] = freqToProbability(term_freq[i], term_freq[num_labels + i], all_terms_prob, lamda) term_prob[num_labels + i] = freqToProbability(term_freq[num_labels + i], term_freq[i], all_terms_prob, lamda) vocab_choice = term_prob #Clear memory for unused variables all_terms_list = all_terms.keys() all_terms = {}; all_terms_prob = {}; vectorised_train_documents_tf = [] print "Generating term-weights complete and it took : ", print_time(start_time) start_time = time.time() #Find cooccurences for all classes and complements if cor_type == "J": cooccurences_by_class = cooccurence_main.get_cooccurences_BR(train_labels, train_docs, P_AandB=False) elif cor_type == "P":
# NOTE(review): the statements below have lost their line breaks and indentation. The loop
# header that supplies `i` for the first statements and the body of the trailing `elif` are
# outside this span, so the original nesting cannot be recovered from here. Python 2 syntax.
# Visible steps, in order:
#   1. classdoc_ids = row indices where train_labels[:, i] is non-zero;
#      term_freq[i] = get_TF(...) for those rows.
#   2. Every (key, value) pair of term_freq[i] is passed to add2dict together with all_term,
#      and tot_freq grows by the sum of term_freq[i]'s values.
#   3. all_term is rebound to a dict of each of its values divided by tot_freq.
#      NOTE(review): Python 2 `/` truncates when both operands are ints -- confirm the values
#      are floats or that `from __future__ import division` is in effect.
#   4. compl_term_freq starts empty; for each column index of train_labels, get_TF over the
#      non-zero rows of train_labels_complement[:, i] is stored at compl_term_freq[i].
#   5. term_prob starts empty; when weight == "tf" or cor_type == "P", term_prob[i] =
#      freqToProbability(term_freq[i], compl_term_freq[i], all_term, lamda) for each column;
#      vocab_choice is bound to term_prob.
#   6. all_terms_list keeps all_term.keys(); all_term and vectorised_train_documents_tf are
#      then rebound to empty containers.
#   7. An elapsed-time message is printed via print_time(start_time); start_time is reset.
#   8. When cor_type == "J": cooccurences_by_class =
#      cooccurence_main.get_cooccurences(train_labels, train_docs, P_AandB=False).
classdoc_ids = numpy.nonzero(train_labels[:, i])[0].tolist() term_freq[i] = get_TF(vectorizer_tf, vectorised_train_documents_tf, classdoc_ids) map(lambda (k, v): add2dict(k, v, all_term), term_freq[i].items()) tot_freq += sum(term_freq[i].values()) all_term = {k: v/tot_freq for k, v in all_term.items()} #Divide features by class-complements compl_term_freq = {} for i in range(train_labels.shape[1]): classdoc_ids = numpy.nonzero(train_labels_complement[:, i])[0].tolist() compl_term_freq[i] = get_TF(vectorizer_tf, vectorised_train_documents_tf, classdoc_ids) #Convert to Probability & Perform Jelinek-Mercer Smoothing term_prob = {} if weight == "tf" or cor_type == "P": for i in range(train_labels.shape[1]): term_prob[i] = freqToProbability(term_freq[i], compl_term_freq[i], all_term, lamda) vocab_choice = term_prob #Clear memory for unused variables all_terms_list = all_term.keys() all_term = {}; vectorised_train_documents_tf = [] print "Generating term-weights complete and it took : ", print_time(start_time) start_time = time.time() #Find cooccurences for all classes if cor_type == "J": cooccurences_by_class = cooccurence_main.get_cooccurences(train_labels, train_docs, P_AandB=False) elif cor_type == "P":