from collections import defaultdict
from math import log10

# These methods belong to an indexer class (not shown here) that provides
# self.index_type, self.custom_log and self.create_idf_counter, and they
# build Reverse_index objects defined elsewhere in the project.


def create_with_ponderation_tf_idf(self, index, compute_norm=True):
    # w(t, d) = (1 + log(tf(t, d))) * log10(N / df(t)).
    # Note: despite its name, reverse_index.idf holds raw document
    # frequencies df(t); the formula below divides N by it.
    N = len(index)
    reverse_index = Reverse_index(self.index_type)
    reverse_index.idf = self.create_idf_counter(index)
    reverse_index.other_infos['norms'] = defaultdict(
        lambda: defaultdict(float))
    id_full_list = []
    for (document_id, tf_counter) in index:
        for term in tf_counter:
            tf_idf_ponderation = (
                1 + self.custom_log(tf_counter[term])) * log10(
                    float(N) / reverse_index.idf[term])
            reverse_index.add_entry(term, document_id, tf_idf_ponderation)
            id_full_list.append(document_id)
            if compute_norm:
                # Accumulate per-document norms of the weight vector:
                # 'linear' is the L1 sum, 'quadratic' the sum of squares.
                reverse_index.other_infos['norms'][document_id][
                    'linear'] += tf_idf_ponderation
                reverse_index.other_infos['norms'][document_id][
                    'quadratic'] += tf_idf_ponderation * tf_idf_ponderation
    reverse_index.set_id_set(set(id_full_list))
    return reverse_index
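# A minimal, self-contained sketch of the weighting above, for reference.
# It assumes self.custom_log is log10 applied to a strictly positive tf and
# that df is the number of documents containing the term; the names mirror
# the surrounding class but are otherwise illustrative.
def _tf_idf_weight_sketch(tf, df, n_docs):
    # Same formula as create_with_ponderation_tf_idf above:
    # w(t, d) = (1 + log10(tf(t, d))) * log10(N / df(t))
    return (1 + log10(tf)) * log10(float(n_docs) / df)

# Example: tf = 3 in its document, term present in 1 of 2 documents:
#   _tf_idf_weight_sketch(3, 1, 2) == (1 + log10(3)) * log10(2) ≈ 0.4447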
def create_with_ponderation_normal_frequency(self, index):
    # w(t, d) = tf(t, d) / max over t' of tf(t', d)
    reverse_index = Reverse_index(self.index_type)
    reverse_index.idf = self.create_idf_counter(index)
    reverse_index.other_infos['norms'] = defaultdict(
        lambda: defaultdict(float))
    id_full_list = []
    max_frequency_in_document = defaultdict(int)
    # First, build the unnormalized reverse index and record each
    # document's maximum term frequency.
    for (document_id, tf_counter) in index:
        for term in tf_counter:
            tf_ponderation = tf_counter[term]
            reverse_index.add_entry(term, document_id, tf_ponderation)
            max_frequency_in_document[document_id] = max(
                max_frequency_in_document[document_id], tf_ponderation)
            id_full_list.append(document_id)
    # Then normalize each weight by the maximum frequency occurring in its
    # document, and accumulate the per-document norms from the normalized
    # weights.
    for word in reverse_index.get_all_words():
        for document_id in reverse_index.get_entry(word):
            normalized_ponderation = reverse_index.get_entry(
                word)[document_id] / float(
                    max_frequency_in_document[document_id])
            reverse_index.get_entry(word)[document_id] = (
                normalized_ponderation)
            reverse_index.other_infos['norms'][document_id][
                'linear'] += normalized_ponderation
            reverse_index.other_infos['norms'][document_id][
                'quadratic'] += (
                    normalized_ponderation * normalized_ponderation)
    reverse_index.set_id_set(set(id_full_list))
    return reverse_index
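# A minimal sketch of the normalization above: each term frequency in a
# document is divided by that document's maximum term frequency, so the most
# frequent term gets weight 1.0. Standalone illustration only; the method
# above performs the same computation inside the reverse index.
def _normalized_frequency_sketch(tf_counter):
    # w(t, d) = tf(t, d) / max over t' of tf(t', d)
    max_tf = max(tf_counter.values())
    return dict((term, tf / float(max_tf))
                for term, tf in tf_counter.items())

# Example:
#   _normalized_frequency_sketch({"cat": 1, "mat": 3})
#   == {"cat": 0.333..., "mat": 1.0}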