def add_vectors(self, a=None, b=None, log_tf_a=True, log_tf_b=True):
    """Return the element-wise sum of vectors a and b as a SuperList.

    Each element is passed through self.log_tf() before summing, unless
    the corresponding log_tf_* flag is False (i.e. that vector already
    holds log_tf values).

    :param a: first vector (list-like); defaults to an empty vector.
    :param b: second vector; when empty/None it is replaced by a zero
              vector of the same length as a.
    :param log_tf_a: apply log_tf to the elements of a.
    :param log_tf_b: apply log_tf to the elements of b.
    :returns: a new SuperList of the summed (possibly log_tf'd) values.
    :raises ValueError: if a and b have different lengths.
    """
    # None defaults avoid the shared-mutable-default pitfall.
    if a is None:
        a = []
    if not b:
        # Treat a missing/empty b as a zero vector matching a's length.
        b = SuperList()
        b.do_padding(new_len=len(a), padding_data=0)
    elif len(a) != len(b):
        if self.verbose:
            print("add_vectors: %d != %d" % (len(a), len(b)))
        # Was a bare `raise Exception`; raise a specific, descriptive
        # error instead (still caught by any `except Exception`).
        raise ValueError("add_vectors: vector lengths differ (%d != %d)"
                         % (len(a), len(b)))
    sum_vector = SuperList()
    for i in range(len(a)):
        sum_vector.append(self.log_tf(a[i], do_nothing=not log_tf_a)
                          + self.log_tf(b[i], do_nothing=not log_tf_b))
    return sum_vector
def compare_queries(self, testing=True):
    """Classify every stored query against the training matrix.

    For each query, the distance to every training document is computed
    and the documents are collected in ranked order; the top class is
    then predicted via self.get_top_class().

    :param testing: kept for interface compatibility (unused here).
    :returns: list of (actual_class, predicted_class) tuples.
    """
    return_value = []
    queries_count = 0
    total_queries = len(self.queries)
    if self.verbose:
        print("\nCalculating for %d queries" % total_queries)
    # Progress is reported every fifth of the workload; guard the
    # divisor so fewer than 5 queries no longer raises
    # ZeroDivisionError (original did `queries_count % (len/5)`).
    progress_step = max(total_queries // 5, 1)
    # Before doing any comparisons we need to convert the matrix to log_tf
    # Moved the below line to calculate_training_data()
    #self.matrix_to_log_tf()
    for query in self.queries:
        if self.verbose:
            queries_count += 1
            if queries_count % progress_step == 0:
                # Fixed typo in the progress message: "querues".
                print("- %d queries has been processed" % queries_count)
        top_k_classes = SuperList()
        for doc in self.matrix:
            q_distance = self.calculate_vectors_distance(
                query['terms'], doc['terms'])
            item = {"class": doc['class'], "distance": q_distance}
            top_k_classes.populate_in_reverse_order(
                item, self._greater_than)
        # Euclidean distance ranks ascending, so flip the ordering.
        if self.distance_metric == "euclid":
            top_k_classes.reverse()
        return_value.append(
            (query["class"],
             self.get_top_class(nearest_docs=top_k_classes,
                                query_class=query["class"])[0]))
    return return_value
def calculate_proto_classes(self):
    """Build one centroid (proto-class) vector per document class.

    First pass: sum the log_tf vectors of all documents belonging to
    each class. Second pass: divide each class sum by its document
    count to obtain the mean (centroid) vector.
    """
    sum_vector = SuperList()
    for doc in self.matrix:
        # `in` replaces the Python-2-only dict.has_key().
        if doc['class'] in self.proto_classes:
            # Updating an existing proto-class with a new doc; only the
            # newly added vector still needs the log_tf transform.
            sum_vector = self.add_vectors(
                a=self.proto_classes[doc['class']]['log_tf'],
                b=doc['terms'],
                log_tf_a=False,
                log_tf_b=True)
            self.proto_classes[doc['class']] = {
                'log_tf': sum_vector,
                'docs_count':
                    self.proto_classes[doc['class']]['docs_count'] + 1
            }
        else:
            # First doc of this class; add_vectors applies log_tf by
            # default.
            sum_vector = self.add_vectors(a=doc['terms'], log_tf_a=True)
            self.proto_classes[doc['class']] = {
                'log_tf': sum_vector,
                'docs_count': 1
            }
    # Only values are rebound here (keys unchanged), so iterating the
    # dict directly is safe. Removed the unused `vector_len` local.
    for p_class in self.proto_classes:
        # Centroid = mean of the summed vectors.
        self.proto_classes[p_class]['log_tf'] = self.divide_vector(
            self.proto_classes[p_class]['log_tf'],
            self.proto_classes[p_class]['docs_count'])
def add_doc(self, doc_id='', doc_class='', doc_terms=None, do_padding=False):
    """Add a training document to the term/document matrix.

    Each term is registered in the shared vocabulary (self.terms) and
    the document's term vector is built positionally against it.

    :param doc_id: identifier of the document.
    :param doc_class: class label of the document.
    :param doc_terms: iterable of terms; defaults to empty.
    :param do_padding: when True, pad all matrix rows afterwards.
    """
    # None default avoids the shared mutable-default pitfall.
    if doc_terms is None:
        doc_terms = []
    my_doc_terms = SuperList()
    for term in doc_terms:
        self.terms.unique_append(term)
        my_doc_terms.insert_after_padding(self.terms.index(term))
    self.matrix.append({'id': doc_id,
                        'class': doc_class,
                        'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def add_query(self, query_id='', query_class='n/a', query_terms=None):
    """Add a query vector aligned to the training vocabulary.

    Terms never seen during training cannot be mapped to an index;
    they are counted in 'new_terms_count' instead of raising.

    :param query_id: identifier of the query.
    :param query_class: true class label (default "n/a").
    :param query_terms: iterable of query terms; defaults to empty.
    """
    if query_terms is None:
        query_terms = []
    my_query_terms = SuperList()
    my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
    new_terms_count = 0
    for term in query_terms:
        try:
            my_query_terms.insert_after_padding(self.terms.index(term))
        except ValueError:
            # Term not obtained in the training phase: list.index
            # raises ValueError for unknown terms (assumes SuperList
            # follows list semantics -- TODO confirm). Count it rather
            # than swallowing every exception with a bare `except:`.
            new_terms_count += 1
    self.queries.append({'id': query_id,
                         'class': query_class,
                         'terms': my_query_terms,
                         'new_terms_count': new_terms_count})
def add_query(self, query_id='', query_class='n/a', query_terms=None):
    """Add a query vector (log_tf-transformed) aligned to the vocabulary.

    Terms never seen during training are silently ignored.

    :param query_id: identifier of the query.
    :param query_class: true class label (default "n/a").
    :param query_terms: iterable of query terms; defaults to empty.
    """
    if query_terms is None:
        query_terms = []
    my_query_terms = SuperList()
    my_query_terms.do_padding(new_len=len(self.terms), padding_data=0)
    for term in query_terms:
        try:
            my_query_terms.insert_after_padding(self.terms.index(term))
        except ValueError:
            # Term not obtained in the training phase; ignore it.
            # (list.index raises ValueError -- narrowed from a bare
            # `except:` which would also hide real bugs.)
            pass
    # BUG FIX: add_vectors builds and RETURNS a new vector; it does not
    # mutate its argument. The original discarded the return value, so
    # the log_tf conversion never took effect.
    my_query_terms = self.add_vectors(a=my_query_terms, log_tf_a=True)
    self.queries.append({'id': query_id,
                         'class': query_class,
                         'terms': my_query_terms})
def __init__(self, verbose=False, fold="n/a", config=object, ev=object):
    """Initialize classifier state for one cross-validation fold.

    :param verbose: print progress information when True.
    :param fold: fold identifier (numeric string, or the "n/a" default).
    :param config: configuration provider exposing get_configuration().
    :param ev: evaluation module instance.
    """
    self.verbose = verbose
    self.fold = fold
    self.config = config
    self.config_data = config.get_configuration()
    self.distance_metric = self.config_data['distance_metric']
    # Set k=0 now; let kNN reset it later on.
    self.k = 0
    # Configure the evaluation module.
    self.ev = ev
    self.terms = SuperList()
    self.matrix = []
    self.queries = []
    if self.verbose:
        # NOTE(review): int(fold) raises ValueError for the default
        # fold="n/a"; verbose callers are apparently expected to pass
        # a numeric fold -- TODO confirm with callers.
        print("\nInitialization for fold %d done!" % int(fold))
def add_doc(self, doc_id='', doc_class='', doc_terms=None, do_padding=True):
    """Add a training document to the per-class frequency matrix.

    In multivariate (Bernoulli) mode, duplicate terms within one
    document are collapsed so only term presence is counted.

    :param doc_id: identifier of the document (unused in the matrix).
    :param doc_class: class label; creates a new m_matrix row on first
                      sight.
    :param doc_terms: iterable of terms; defaults to empty.
    :param do_padding: when True, pad all rows afterwards.
    """
    if doc_terms is None:
        doc_terms = []
    # If multivariate, remove multiple occurrences of terms in document.
    if self.mode == 'm_variate':
        doc_terms = list(set(doc_terms))
    # Create the class row up front (`in` replaces Python-2 has_key);
    # hoisted out of the term loop so that a document with no terms
    # still gets its class counted instead of raising KeyError below.
    if doc_class not in self.m_matrix:
        self.m_matrix[doc_class] = {'freq': SuperList(),
                                    'total': 0,
                                    'docs_count': 0}
    for term in doc_terms:
        self.terms.unique_append(term)
        self.m_matrix[doc_class]['freq'].insert_after_padding(
            index=self.terms.index(term))
    self.m_matrix[doc_class]['docs_count'] += 1
    if do_padding:
        self.do_padding()
def _non_zero_indices(self, l):
    """Return a SuperList of the positions in l whose value is non-zero."""
    indices = SuperList()
    for position, value in enumerate(l):
        if value != 0:
            indices.append(position)
    return indices
def divide_vector(self, vector=None, scalar=1):
    """Return a new SuperList with each element of vector divided by scalar.

    Elements are coerced to float so the division is true division even
    under Python 2 integer semantics.

    :param vector: iterable of numbers; defaults to empty.
    :param scalar: divisor (default 1); must be non-zero.
    :returns: new SuperList of floats.
    :raises ZeroDivisionError: if scalar is 0.
    """
    # None default avoids the shared mutable-default pitfall.
    if vector is None:
        vector = []
    result = SuperList()
    for item in vector:
        result.append(float(item) / scalar)
    return result
def vector_log_tf(self, a=None, do_nothing=False):
    """Return a new SuperList with self.log_tf applied to each element of a.

    :param a: input vector; defaults to empty (None default avoids the
              shared mutable-default pitfall).
    :param do_nothing: forwarded to log_tf; when True values pass
                       through unchanged.
    :returns: new SuperList of transformed values.
    """
    if a is None:
        a = []
    new_vector = SuperList()
    # Iterate values directly instead of `range(0, a.__len__())`.
    for value in a:
        new_vector.append(self.log_tf(value=value, do_nothing=do_nothing))
    return new_vector