示例#1
0
    def classify_problem(self):
        """Assign every problem in ``PROBLEM_VECTOR`` its best-scoring class.

        For each label, every class in ``CLASSIFY_AVG_SD`` except
        ``u'others_sub'`` is scored as the product of the per-dimension
        Gaussian densities multiplied by ``CLASSIFY_P[cla]``.  The name of
        the class with the highest score is appended to
        ``CLASSIFY_RES[label]``; an empty string is appended when no class
        scores above zero.
        """
        # Stand-in for a density that is exactly zero or could not be
        # computed, so one dimension cannot wipe out the whole product.
        floor = 0.00001

        def _gaussian_density(x, item):
            # Normal pdf evaluated at x for the mean/sd stored in item.
            sd = item['sd']
            avg = item['avg']
            coe = 1 / (math.sqrt(2 * math.pi) * sd)
            ind = (math.e) ** (-((x - avg) * (x - avg)) / (2 * sd * sd))
            return coe * ind

        logging("Start classify problems...", 0)
        for label in self.PROBLEM_VECTOR:
            max_cla = ""
            max_val = 0
            for cla in self.CLASSIFY_AVG_SD:
                if cla == u'others_sub':
                    continue
                cla_p = 1.0
                for i in range(0, self.VEC_CNT):
                    x = self.PROBLEM_VECTOR[label][i]
                    try:
                        p = _gaussian_density(x, self.CLASSIFY_AVG_SD[cla][i])
                    except Exception as ex:
                        # Best effort: report the failure and fall back to
                        # the floor instead of reusing the value left over
                        # from the previous dimension.
                        logging("label: %s, cla: %s, dx: %s, error: %s" % (
                            label, cla, i, ex), 2)
                        p = 0
                    if p == 0:
                        p = floor
                    cla_p *= p
                try:
                    cla_p *= self.CLASSIFY_P[cla]
                except Exception:
                    # No usable entry in CLASSIFY_P: the class cannot win.
                    cla_p = 0
                if cla_p > max_val:
                    max_val = cla_p
                    max_cla = cla
            if label not in self.CLASSIFY_RES:
                self.CLASSIFY_RES[label] = []
            self.CLASSIFY_RES[label].append(max_cla)
示例#2
0
def naive_bayes_classifier(repo, classify_list, classify_p, knowledge, vector_file):
    """Run the problem-classification pipeline and return its result.

    A ``ProblemClassify`` is built from the arguments; its problem vectors
    are read, the per-class statistics are computed, and the problems are
    classified, in that order.

    :return: the classifier's ``CLASSIFY_RES`` attribute.
    """
    logging("Problem Classify Program Start", 0)

    classifier = ProblemClassify(
        repo, classify_list, classify_p, knowledge, vector_file
    )
    # Order matters: each stage consumes what the previous one stored.
    classifier.read_problem_vector()
    classifier.calculate_knowledge_avg_sd()
    classifier.classify_problem()

    logging("Problem Classify Program Finish", 0)
    return classifier.CLASSIFY_RES
示例#3
0
 def get_user_aclist(self, step):
     """Return the distinct users stored for ``self.REPO``.

     As a side effect, MySQL's ``group_concat_max_len`` is set to
     ``step * 10000`` through the same cursor.

     :param step: multiplied by 10000 to give the new
         ``group_concat_max_len`` value.
     :return: list with the first column of every fetched row.
     """
     logging("Get all users in %s" % (self.REPO, ), 0)
     users_list = []
     with connection.cursor() as ues_con:
         sql = "SELECT `user` FROM `ues_ojstatus` where `repo`=%s group by `user`"
         ues_con.execute(sql, (self.REPO,))
         users_list = [row[0] for row in ues_con.fetchall()]
         # NOTE(review): a GLOBAL variable change presumably only reaches
         # sessions opened after it -- confirm this is the intended scope.
         sql = "SET GLOBAL group_concat_max_len=%s"
         ues_con.execute(sql, (step*10000,))
         # Nothing is fetched here: SET produces no result set, and
         # fetching after such a statement is an error under DB-API 2.0.
     logging("There are %d users in the %s DataBase" % (len(users_list), self.REPO), 0)
     return users_list
示例#4
0
    def read_problem_vector(self):
        """Load ``PROBLEM_VECTOR_FILE`` into ``self.PROBLEM_VECTOR``.

        The first line holds two integers: the number of entries to read and
        the vector length, which is stored in ``self.VEC_CNT``.  Every
        following line is a label followed by its float components,
        separated by whitespace.  ``PROBLEM_VECTOR`` is rebuilt from scratch
        and maps each label to a list of floats.
        """
        logging("Start read the problem vector file...", 0)
        with open(self.PROBLEM_VECTOR_FILE, 'r') as vector_fp:
            self.PROBLEM_VECTOR = {}
            # split() without an argument tolerates trailing spaces and the
            # newline, which split(" ") would turn into unparsable tokens.
            pro_cnt, vec_cnt = map(int, vector_fp.readline().split())
            self.VEC_CNT = vec_cnt
            for _ in range(0, pro_cnt):
                fields = vector_fp.readline().split()
                if not fields:
                    # Blank line, or the file is shorter than its header says.
                    continue
                # Store a real list: the vectors are indexed by position
                # elsewhere, which the iterator returned by map() on
                # Python 3 would not support.
                self.PROBLEM_VECTOR[fields[0]] = [float(v) for v in fields[1:]]

        logging("Read the problem vector file finish", 0)
示例#5
0
 def calculate_knowledge_avg_sd(self):
     """Compute the per-dimension mean and standard deviation of each class.

     For every class in ``KNOWLEDGE`` and every dimension below
     ``VEC_CNT``, the component of each listed label's vector is collected
     and ``{'avg': mean, 'sd': std}`` is appended to
     ``CLASSIFY_AVG_SD[cla]``, so the list position equals the dimension.
     Labels whose component cannot be read are logged and left out.
     """
     logging('Start calculate knowledge avg sd...', 0)
     for cla in self.KNOWLEDGE:
         self.CLASSIFY_AVG_SD[cla] = []
         for i in range(0, self.VEC_CNT):
             samples = []
             for label in self.KNOWLEDGE[cla]:
                 try:
                     samples.append(self.PROBLEM_VECTOR[label][i])
                 except Exception as ex:
                     # Deliberately best effort: a label without a usable
                     # vector must not abort the whole computation.
                     logging("cla:%s, label:%s, error:%s" %(cla, label, ex), 2)
             samples = np.array(samples)
             data = {'avg': np.mean(samples), 'sd': np.std(samples)}
             self.CLASSIFY_AVG_SD[cla].append(data)
示例#6
0
 def make_problem_vector(self):
     """Train a word2vec model on ``USER_ACLIST_FILE`` and export its vectors.

     The corpus is read with ``word2vec.Text8Corpus`` and the trained word
     vectors are written to ``PROBLEM_VECTOR_FILE`` in the non-binary
     word2vec format.
     """
     logging("Start make problem vector...", 0)

     corpus = word2vec.Text8Corpus(self.USER_ACLIST_FILE)
     # Alternative reader left disabled in the original:
     # corpus = word2vec.LineSentence(self.USER_ACLIST_FILE)

     # Training hyper-parameters, handed to Word2Vec unchanged.
     training_options = {
         'sentences': corpus,
         'hs': 0,
         'sg': 0,
         'iter': 10,
         'size': 300,
         'window': 5,
         'sample': 1e-4,
     }
     trained = word2vec.Word2Vec(**training_options)

     trained.wv.save_word2vec_format(fname=self.PROBLEM_VECTOR_FILE, binary=False)
     logging("Make problem vector finish", 0)
示例#7
0
class ProblemClassify(object):
    """Problem vectors plus the per-class statistics derived from them.

    ``read_problem_vector`` fills ``PROBLEM_VECTOR`` from a file and
    ``calculate_knowledge_avg_sd`` turns the labelled examples in
    ``KNOWLEDGE`` into per-dimension mean / standard-deviation pairs stored
    in ``CLASSIFY_AVG_SD``.
    """

    def __init__(self, repo, classify_list, classify_p, knowledge, vector_file):
        """Store the configuration; no file is read here.

        :param repo: kept as ``REPO``.
        :param classify_list: kept as ``CLASSIFY_LIST``.
        :param classify_p: kept as ``CLASSIFY_P`` (presumably one prior
            per class -- confirm against the caller).
        :param knowledge: mapping of class name to the labels known to
            belong to that class, kept as ``KNOWLEDGE``.
        :param vector_file: path of the problem vector file, kept as
            ``PROBLEM_VECTOR_FILE``.
        """
        # The call must stay on one line: split after ``super`` it turns
        # into a no-op on a throwaway tuple.
        super(ProblemClassify, self).__init__()
        self.REPO = repo
        self.VEC_CNT = 0
        self.CLASSIFY_RES = {}
        self.PROBLEM_VECTOR = {}
        self.KNOWLEDGE = knowledge
        self.CLASSIFY_P = classify_p
        self.CLASSIFY_LIST = classify_list
        self.PROBLEM_VECTOR_FILE = vector_file

        # Layout: class name -> list with one dict per vector dimension.
        # {
        #     'class1': [
        #         {'avg': xxx, 'sd': xxx},
        #         ...
        #     ],
        #     'class2': [
        #         ...
        #     ],
        # }
        self.CLASSIFY_AVG_SD = {}

    def read_problem_vector(self):
        """Load ``PROBLEM_VECTOR_FILE`` into ``self.PROBLEM_VECTOR``.

        The first line holds two integers: the number of entries to read and
        the vector length, which is stored in ``self.VEC_CNT``.  Every
        following line is a label followed by its float components,
        separated by whitespace.  ``PROBLEM_VECTOR`` is rebuilt from scratch
        and maps each label to a list of floats.
        """
        logging("Start read the problem vector file...", 0)
        with open(self.PROBLEM_VECTOR_FILE, 'r') as vector_fp:
            self.PROBLEM_VECTOR = {}
            # split() without an argument tolerates trailing spaces and the
            # newline, which split(" ") would turn into unparsable tokens.
            pro_cnt, vec_cnt = map(int, vector_fp.readline().split())
            self.VEC_CNT = vec_cnt
            for _ in range(0, pro_cnt):
                fields = vector_fp.readline().split()
                if not fields:
                    # Blank line, or the file is shorter than its header says.
                    continue
                # Store a real list: calculate_knowledge_avg_sd indexes the
                # vectors by position, which the iterator returned by map()
                # on Python 3 would not support.
                self.PROBLEM_VECTOR[fields[0]] = [float(v) for v in fields[1:]]

        logging("Read the problem vector file finish", 0)

    def calculate_knowledge_avg_sd(self):
        """Compute the per-dimension mean and standard deviation of each class.

        For every class in ``KNOWLEDGE`` and every dimension below
        ``VEC_CNT``, the component of each listed label's vector is
        collected and ``{'avg': mean, 'sd': std}`` is appended to
        ``CLASSIFY_AVG_SD[cla]``, so the list position equals the dimension.
        Labels whose component cannot be read are logged and left out.
        """
        logging('Start calculate knowledge avg sd...', 0)
        for cla in self.KNOWLEDGE:
            self.CLASSIFY_AVG_SD[cla] = []
            for i in range(0, self.VEC_CNT):
                samples = []
                for label in self.KNOWLEDGE[cla]:
                    try:
                        samples.append(self.PROBLEM_VECTOR[label][i])
                    except Exception as ex:
                        # Deliberately best effort: a label without a usable
                        # vector must not abort the whole computation.
                        logging("cla:%s, label:%s, error:%s" %(cla, label, ex), 2)
                samples = np.array(samples)
                data = {'avg': np.mean(samples), 'sd': np.std(samples)}
                self.CLASSIFY_AVG_SD[cla].append(data)
        logging('Calculate knowledge avg sd finish', 0)
示例#8
0
 def get_all_user_aclist(self):
     """Fetch the labels of all users in batches and write each batch out.

     The user list comes from ``get_user_aclist``.  It is processed in
     slices of 1000 users: each slice is passed to ``fetch_user_lables``
     and the result is handed to ``write_user_aclist`` together with
     ``USER_ACLIST_FILE``.
     """
     logging("Start get all user aclist...", 0)
     step = 1000
     user_list = self.get_user_aclist(step)
     total = len(user_list)
     for i in range(0, total, step):
         users = user_list[i: i+step]
         users_label = self.fetch_user_lables(users)
         self.write_user_aclist(users_label, self.USER_ACLIST_FILE)
         # Clamp the upper bound so the final, shorter batch is not
         # reported as reaching past the end of the list.
         logging('Users list %d--%d Done!'%(i, min(i+step, total)), 0)
     logging("Get all user aclist finish", 0)
示例#9
0
def make_problems_vector(repo, aclist_file, vector_file):
    """Build the problem vectors for ``repo``.

    A ``ProblemVector`` is created from the arguments; it first collects
    every user's aclist and then produces the problem vectors.
    """
    logging("Make Problems Vectors Main Program Startup...", 0)

    builder = ProblemVector(repo, aclist_file, vector_file)
    # The aclist must be gathered before the vectors are made from it.
    builder.get_all_user_aclist()
    builder.make_problem_vector()

    logging("Make Problems Vectors Finish", 0)
示例#10
0
                    x = self.PROBLEM_VECTOR[label][i]
                    try:
                        p = __get_gp(x, self.CLASSIFY_AVG_SD[cla][i])
                    except Exception, ex:
                        logging("label: %s, cla: %s, dx: %s, error: %s" %(
                            label, cla, i), 2)
                    if p == 0: p = 0.00001
                    cla_p *= p
                try:
                    cla_p *= self.CLASSIFY_P[cla]
                except:
                    cla_p = 0
                if cla_p > max_val:
                    max_val = cla_p
                    max_cla = cla
            if label not in self.CLASSIFY_RES:
                self.CLASSIFY_RES[label] = []
            self.CLASSIFY_RES[label].append(max_cla)
        # print self.CLASSIFY_RES
        logging("Classify problems finish", 0)

def naive_bayes_classifier(repo, classify_list, classify_p, knowledge, vector_file):
    """Run the problem-classification pipeline and return its result.

    A ``ProblemClassify`` is built from the arguments; its problem vectors
    are read, the per-class statistics are computed, and the problems are
    classified, in that order.

    :return: the classifier's ``CLASSIFY_RES`` attribute.
    """
    logging("Problem Classify Program Start", 0)

    classifier = ProblemClassify(
        repo, classify_list, classify_p, knowledge, vector_file
    )
    # Order matters: each stage consumes what the previous one stored.
    classifier.read_problem_vector()
    classifier.calculate_knowledge_avg_sd()
    classifier.classify_problem()

    logging("Problem Classify Program Finish", 0)
    return classifier.CLASSIFY_RES