def classify_problem(self):
    """Label every problem vector with its highest-scoring class.

    For each label in ``self.PROBLEM_VECTOR`` and each class in
    ``self.CLASSIFY_AVG_SD`` (except ``u'others_sub'``, which is skipped),
    the score is the product of the per-dimension normal densities built
    from that class's ``avg``/``sd`` entries, multiplied by
    ``self.CLASSIFY_P[cla]``.  The class with the largest score strictly
    above 0 is appended to ``self.CLASSIFY_RES[label]``; when no class
    scores above 0 the empty string is appended instead.
    """

    def gaussian_density(x, item):
        """Return the normal density at ``x`` for ``item['avg']`` / ``item['sd']``."""
        sd = item['sd']
        avg = item['avg']
        coe = 1 / (math.sqrt(2 * math.pi) * sd)
        ind = math.e ** (-((x - avg) * (x - avg)) / (2 * sd * sd))
        return coe * ind

    logging("Start classify problems...", 0)
    for label in self.PROBLEM_VECTOR:
        max_cla = ""
        max_val = 0
        for cla in self.CLASSIFY_AVG_SD:
            if cla == u'others_sub':
                continue
            # NOTE: the score is a plain product over VEC_CNT dimensions; it
            # is not computed in log space and a zero 'sd' is not
            # special-cased.
            cla_p = 1.0
            for i in range(0, self.VEC_CNT):
                x = self.PROBLEM_VECTOR[label][i]
                try:
                    p = gaussian_density(x, self.CLASSIFY_AVG_SD[cla][i])
                except Exception as ex:
                    # The message has four placeholders, so the exception
                    # itself must be passed; otherwise formatting raises.
                    logging("label: %s, cla: %s, dx: %s, error: %s" % (
                        label, cla, i, ex), 2)
                    # Without this, ``p`` would be undefined or carry the
                    # previous dimension's value.  Zero routes the failure
                    # through the same floor as a genuine zero density.
                    p = 0
                if p == 0:
                    # Keep one vanishing dimension from zeroing the product.
                    p = 0.00001
                cla_p *= p
            try:
                cla_p *= self.CLASSIFY_P[cla]
            except Exception:
                # Best effort: a class whose prior cannot be applied gets
                # score 0 and therefore can never be selected.
                cla_p = 0
            if cla_p > max_val:
                max_val = cla_p
                max_cla = cla
        if label not in self.CLASSIFY_RES:
            self.CLASSIFY_RES[label] = []
        self.CLASSIFY_RES[label].append(max_cla)
def naive_bayes_classifier(repo, classify_list, classify_p, knowledge, vector_file):
    """Run the classification pipeline and return its result mapping.

    Builds a ``ProblemClassify`` from the given arguments, then reads the
    problem vectors, computes the per-class statistics and classifies the
    problems, in that order.

    Returns:
        The ``CLASSIFY_RES`` attribute of the classifier after the run.
    """
    logging("Problem Classify Program Start", 0)

    classifier = ProblemClassify(
        repo, classify_list, classify_p, knowledge, vector_file)

    # Each stage consumes what the previous one stored on the instance.
    classifier.read_problem_vector()
    classifier.calculate_knowledge_avg_sd()
    classifier.classify_problem()

    logging("Problem Classify Program Finish", 0)
    return classifier.CLASSIFY_RES
def get_user_aclist(self, step):
    """Return the distinct users stored for ``self.REPO`` in ``ues_ojstatus``.

    Also issues ``SET GLOBAL group_concat_max_len`` with ``step * 10000``
    on the same cursor before returning.

    Args:
        step: batch size used by the caller; only used here to size
            ``group_concat_max_len``.

    Returns:
        A list holding the first column of every row of the user query.
    """
    logging("Get all users in %s" % (self.REPO, ), 0)

    with connection.cursor() as ues_con:
        ues_con.execute(
            "SELECT `user` FROM `ues_ojstatus` where `repo`=%s group by `user`",
            (self.REPO,),
        )
        users_list = [row[0] for row in ues_con.fetchall()]

        # NOTE(review): MySQL documents that SET GLOBAL only affects
        # sessions opened afterwards -- confirm that the query relying on
        # group_concat_max_len runs on a new connection.
        ues_con.execute("SET GLOBAL group_concat_max_len=%s", (step*10000,))
        # The fetch result is not used; the call is kept as in the original.
        ues_con.fetchone()

    logging("There are %d users in the %s DataBase" % (len(users_list), self.REPO), 0)
    return users_list
def read_problem_vector(self):
    """Load the problem vectors from ``self.PROBLEM_VECTOR_FILE``.

    The first line of the file holds two integers: the number of entries
    and the vector length (stored in ``self.VEC_CNT``).  Each following
    line is a label followed by its float components.  ``self.PROBLEM_VECTOR``
    is rebuilt from scratch as ``{label: [float, ...]}``.

    Raises:
        ValueError: if the header or a component cannot be parsed.
    """
    logging("Start read the problem vector file...", 0)
    with open(self.PROBLEM_VECTOR_FILE, 'r') as vector_file:
        self.PROBLEM_VECTOR = {}
        # split() without an argument tolerates repeated or trailing
        # whitespace and strips the newline.
        pro_cnt, vec_cnt = map(int, vector_file.readline().split())
        self.VEC_CNT = vec_cnt
        for _ in range(0, pro_cnt):
            fields = vector_file.readline().split()
            if not fields:
                # Blank line, or the file ended before ``pro_cnt`` entries.
                continue
            # A real list keeps the components indexable on Python 3 too.
            self.PROBLEM_VECTOR[fields[0]] = [float(value) for value in fields[1:]]
    logging("Read the problem vector file finish", 0)
def calculate_knowledge_avg_sd(self):
    """Compute the per-dimension mean and standard deviation of each class.

    For every class in ``self.KNOWLEDGE`` and every dimension below
    ``self.VEC_CNT``, gathers that component from the vectors of the
    class's labels and stores ``{'avg': mean, 'sd': std}`` in
    ``self.CLASSIFY_AVG_SD[cla]``, one dict per dimension.  A label whose
    component cannot be read is logged and left out of the sample.
    """
    logging('Start calculate knowledge avg sd...', 0)
    for cla in self.KNOWLEDGE:
        per_dimension = []
        self.CLASSIFY_AVG_SD[cla] = per_dimension
        for dim in range(self.VEC_CNT):
            samples = []
            for label in self.KNOWLEDGE[cla]:
                try:
                    samples.append(self.PROBLEM_VECTOR[label][dim])
                except Exception as ex:
                    logging("cla:%s, label:%s, error:%s" %(cla, label, ex), 2)
            samples = np.array(samples)
            per_dimension.append({
                'avg': np.mean(samples),
                'sd': np.std(samples),
            })
def make_problem_vector(self):
    """Train a word2vec model on the AC-list file and save its vectors.

    Reads ``self.USER_ACLIST_FILE`` through ``word2vec.Text8Corpus`` and
    writes the trained vectors to ``self.PROBLEM_VECTOR_FILE`` in the
    non-binary word2vec format.
    """
    logging("Start make problem vector...", 0)

    # Alternative reader kept for reference:
    # word2vec.LineSentence(self.USER_ACLIST_FILE)
    training_options = {
        'sentences': word2vec.Text8Corpus(self.USER_ACLIST_FILE),
        'hs': 0,
        'sg': 0,
        'iter': 10,
        'size': 300,
        'window': 5,
        'sample': 1e-4,
    }
    model = word2vec.Word2Vec(**training_options)

    model.wv.save_word2vec_format(fname=self.PROBLEM_VECTOR_FILE, binary=False)
    logging("Make problem vector finish", 0)
class ProblemClassify(object):
    """Holds problem vectors and per-class statistics for classification."""

    def __init__(self, repo, classify_list, classify_p, knowledge, vector_file):
        """Store the inputs and initialise the empty result containers.

        Args:
            repo: stored as ``REPO``.
            classify_list: stored as ``CLASSIFY_LIST``.
            classify_p: stored as ``CLASSIFY_P`` (presumably per-class
                priors -- confirm against the code that reads it).
            knowledge: stored as ``KNOWLEDGE``; iterated as a mapping from
                class to the labels known to belong to it.
            vector_file: path of the problem vector file to read.
        """
        super(ProblemClassify, self).__init__()
        self.REPO = repo
        self.VEC_CNT = 0
        self.CLASSIFY_RES = {}
        self.PROBLEM_VECTOR = {}
        self.KNOWLEDGE = knowledge
        self.CLASSIFY_P = classify_p
        self.CLASSIFY_LIST = classify_list
        self.PROBLEM_VECTOR_FILE = vector_file
        # Shape once filled by calculate_knowledge_avg_sd():
        # {
        #     'class1': [{'avg': ..., 'sd': ...}, ...],  # one dict per dimension
        #     'class2': [...],
        # }
        self.CLASSIFY_AVG_SD = {}

    def read_problem_vector(self):
        """Load the problem vectors from ``self.PROBLEM_VECTOR_FILE``.

        The first line of the file holds two integers: the number of
        entries and the vector length (stored in ``self.VEC_CNT``).  Each
        following line is a label followed by its float components.
        ``self.PROBLEM_VECTOR`` is rebuilt as ``{label: [float, ...]}``.

        Raises:
            ValueError: if the header or a component cannot be parsed.
        """
        logging("Start read the problem vector file...", 0)
        with open(self.PROBLEM_VECTOR_FILE, 'r') as vector_file:
            self.PROBLEM_VECTOR = {}
            # split() without an argument tolerates repeated or trailing
            # whitespace and strips the newline.
            pro_cnt, vec_cnt = map(int, vector_file.readline().split())
            self.VEC_CNT = vec_cnt
            for _ in range(0, pro_cnt):
                fields = vector_file.readline().split()
                if not fields:
                    # Blank line, or the file ended before ``pro_cnt`` entries.
                    continue
                # A real list keeps the components indexable on Python 3 too.
                self.PROBLEM_VECTOR[fields[0]] = [float(value) for value in fields[1:]]
        logging("Read the problem vector file finish", 0)

    def calculate_knowledge_avg_sd(self):
        """Compute the per-dimension mean and standard deviation of each class.

        For every class in ``self.KNOWLEDGE`` and every dimension below
        ``self.VEC_CNT``, gathers that component from the vectors of the
        class's labels and stores ``{'avg': mean, 'sd': std}`` in
        ``self.CLASSIFY_AVG_SD[cla]``.  A label whose component cannot be
        read is logged and left out of the sample.
        """
        logging('Start calculate knowledge avg sd...', 0)
        for cla in self.KNOWLEDGE:
            self.CLASSIFY_AVG_SD[cla] = []
            for i in range(0, self.VEC_CNT):
                cla_xi = []
                for label in self.KNOWLEDGE[cla]:
                    try:
                        cla_xi.append(self.PROBLEM_VECTOR[label][i])
                    except Exception as ex:
                        logging("cla:%s, label:%s, error:%s" %(cla, label, ex), 2)
                cla_xi = np.array(cla_xi)
                data = {'avg': np.mean(cla_xi), 'sd': np.std(cla_xi)}
                self.CLASSIFY_AVG_SD[cla].append(data)
        logging('Calculate knowledge avg sd finish', 0)
def get_all_user_aclist(self):
    """Fetch and write the label lists of all users, in batches of 1000.

    Gets the user list from ``self.get_user_aclist``, then for each batch
    calls ``self.fetch_user_lables`` and passes the result to
    ``self.write_user_aclist`` together with ``self.USER_ACLIST_FILE``.
    """
    logging("Start get all user aclist...", 0)
    step = 1000
    user_list = self.get_user_aclist(step)
    total = len(user_list)
    for i in range(0, total, step):
        users = user_list[i: i+step]
        users_label = self.fetch_user_lables(users)
        self.write_user_aclist(users_label, self.USER_ACLIST_FILE)
        # Clamp the upper bound so a short final batch is reported as is.
        logging('Users list %d--%d Done!'%(i, min(i+step, total)), 0)
    logging("Get all user aclist finish", 0)
def make_problems_vector(repo, aclist_file, vector_file):
    """Build the problem vectors for ``repo``.

    Creates a ``ProblemVector`` from the arguments, collects all user AC
    lists and then trains and saves the vectors.
    """
    logging("Make Problems Vectors Main Program Startup...", 0)

    builder = ProblemVector(repo, aclist_file, vector_file)
    # The AC lists must be collected before the vectors are made from them.
    builder.get_all_user_aclist()
    builder.make_problem_vector()

    logging("Make Problems Vectors Finish", 0)
x = self.PROBLEM_VECTOR[label][i] try: p = __get_gp(x, self.CLASSIFY_AVG_SD[cla][i]) except Exception, ex: logging("label: %s, cla: %s, dx: %s, error: %s" %( label, cla, i), 2) if p == 0: p = 0.00001 cla_p *= p try: cla_p *= self.CLASSIFY_P[cla] except: cla_p = 0 if cla_p > max_val: max_val = cla_p max_cla = cla if label not in self.CLASSIFY_RES: self.CLASSIFY_RES[label] = [] self.CLASSIFY_RES[label].append(max_cla) # print self.CLASSIFY_RES logging("Classify problems finish", 0) def naive_bayes_classifier(repo, classify_list, classify_p, knowledge, vector_file): logging("Problem Classify Program Start", 0) problem_classify = ProblemClassify(repo, classify_list, classify_p, knowledge, vector_file) problem_classify.read_problem_vector() problem_classify.calculate_knowledge_avg_sd() problem_classify.classify_problem() logging("Problem Classify Program Finish", 0) return problem_classify.CLASSIFY_RES