def __init__(self, bad_files_num, good_files_num, ngram_size, top_ngrams_size):
    """Record the corpus sizes and start with an empty n-gram ranking.

    Stores the per-class file counts, the total number of files, the share
    of the corpus each class represents, and a SortedCollection keyed on
    the second element of the items it will hold.

    Raises ZeroDivisionError when both file counts are zero.
    """
    self._bad_files_num = bad_files_num
    self._good_files_num = good_files_num
    self.top_ngrams_size = top_ngrams_size
    self.ngram_size = ngram_size

    # The total is computed once and shared by both proportions.
    total_files = bad_files_num + good_files_num
    self._files_num = total_files
    self._good_files_proportion = good_files_num / total_files
    self._bad_files_proportion = bad_files_num / total_files

    rank_key = itemgetter(1)
    self._most_informative_ngrams = SortedCollection(key=rank_key)
class Classifier(object):
    """Score n-grams by information gain (IG) and keep the best-scoring ones.

    Every n-gram passed to ``add_new_ngram`` is scored and inserted into a
    sorted collection; after each insert the collection is trimmed back to
    at most ``top_ngrams_size`` entries.
    """

    GOOD_FILES_PROP = 0
    BAD_FILES_PROP = 1

    def __init__(self, bad_files_num, good_files_num, ngram_size, top_ngrams_size):
        """Store the corpus sizes and create the empty ranking.

        Args:
            bad_files_num: number of files in the "bad" class.
            good_files_num: number of files in the "good" class.
            ngram_size: stored as-is; not used by this class itself.
            top_ngrams_size: maximum number of n-grams kept in the ranking.

        Raises:
            ZeroDivisionError: if both file counts are zero.
        """
        self._bad_files_num = bad_files_num
        self._good_files_num = good_files_num
        self.top_ngrams_size = top_ngrams_size
        self.ngram_size = ngram_size
        self._files_num = bad_files_num + good_files_num
        self._good_files_proportion = good_files_num / self._files_num
        self._bad_files_proportion = bad_files_num / self._files_num
        # Items are (ngram_str, 1 - ig) tuples; the collection is keyed on
        # the second element.
        self._most_informative_ngrams = SortedCollection(key=itemgetter(1))

    @staticmethod
    def _ratio(count, total):
        """Return ``count / total``, or 0.0 when ``total`` is zero.

        A class with no files contributes nothing to the score, so its
        proportions are treated as zero instead of raising
        ZeroDivisionError.
        """
        if total == 0:
            return 0.0
        return count / total

    @staticmethod
    def _calculate_element(p_vj_ci, p_vj, p_ci):
        """Return one term of the IG sum.

        The term is ``p_vj_ci * log10(p_vj_ci / (p_vj * p_ci))``. It is
        defined as 0 when any of the three inputs is 0, which avoids both a
        division by zero and ``log(0)``.
        """
        if p_ci == 0 or p_vj == 0 or p_vj_ci == 0:
            return 0
        return p_vj_ci * log((p_vj_ci / (p_vj * p_ci)), 10)

    def _get_ngram_ig(self, ngram):
        """Return the IG of ``ngram``.

        The score is the sum of four terms: n-gram present / absent,
        combined with the good / bad class.

        Args:
            ngram: object exposing ``good_prop``, ``bad_prop`` and
                ``proportion``.

        NOTE(review): as computed by ``add_new_ngram``, ``good_prop`` and
        ``bad_prop`` are per-class proportions (appearances divided by the
        class size) and are used here in place of the P(v, c) term --
        confirm this is the intended reading of the formula.
        """
        app_in_good_files = self._calculate_element(
            ngram.good_prop, ngram.proportion, self._good_files_proportion)
        app_in_bad_files = self._calculate_element(
            ngram.bad_prop, ngram.proportion, self._bad_files_proportion)
        absence_from_good_files = self._calculate_element(
            1 - ngram.good_prop, 1 - ngram.proportion,
            self._good_files_proportion)
        absence_from_bad_files = self._calculate_element(
            1 - ngram.bad_prop, 1 - ngram.proportion,
            self._bad_files_proportion)
        return (app_in_bad_files + app_in_good_files
                + absence_from_bad_files + absence_from_good_files)

    def _add_ngram_to_collection(self, ngram_item):
        """Insert ``ngram_item`` and trim the ranking if it overflowed.

        Args:
            ngram_item: ``(ngram_str, 1 - ig)`` tuple.
        """
        ranking = self._most_informative_ngrams
        ranking.insert(ngram_item)
        if ranking.get_size() > self.top_ngrams_size:
            # A single insert overflows by one entry; the entry at index
            # top_ngrams_size is the one with the worst IG (assuming the
            # collection keeps ascending key order, i.e. descending IG).
            ranking.remove(ranking[self.top_ngrams_size])

    def add_new_ngram(self, ngram_str, good_appearances, bad_appearances):
        """Create an n-gram, score it and add it to the ranking.

        Args:
            ngram_str: the n-gram itself.
            good_appearances: appearance count of the n-gram in good files.
            bad_appearances: appearance count of the n-gram in bad files.

        A class with zero files yields a proportion of 0 rather than a
        ZeroDivisionError.
        """
        good_prop = self._ratio(good_appearances, self._good_files_num)
        bad_prop = self._ratio(bad_appearances, self._bad_files_num)
        prop = self._ratio(good_appearances + bad_appearances, self._files_num)
        ngram = Ngram(ngram_str, prop, good_prop, bad_prop)
        ngram.ig = self._get_ngram_ig(ngram)
        # The stored key is 1 - ig, so a larger IG gives a smaller key.
        ngram_item = (ngram.ngram_str, 1 - ngram.ig)
        self._add_ngram_to_collection(ngram_item)

    def get_most_informative_ngrams(self):
        """Return the n-gram strings currently kept, in collection order."""
        return [item[0] for item in self._most_informative_ngrams]