예제 #1
0
    def pretreatment(self):
        """Build the training artefacts from the origin data file and save them.

        Reads three sequences (title, content, result) through
        ``self.DT.read_excel``, replaces every negative result with ``-1``,
        derives keyword features for the content and the titles with a
        ``PreTreater``, and writes each dictionary / feature set with
        ``np.save`` to the paths stored on ``self``. Finally calls
        ``self.create_random_seed`` with the number of results.
        """
        title, content, result = self.DT.read_excel(self.origin_data_file)

        # Collapse every negative label onto the single value -1, in place.
        for idx, label in enumerate(result):
            if label < 0:
                result[idx] = -1

        # NOTE(review): PreTreater may keep state between calls (not visible
        # here), so the calls below stay in their original order.
        treater = PreTreater()
        content_keywords = treater.get_keywords(content)

        word_dict = treater.getdict()
        content_features = treater.create_train_data_dict(
            word_dict, content_keywords)

        # Title features built on the plain word dictionary.
        title_keywords = treater.get_keywords(title, all_tag=True)
        title_features = treater.create_train_data_dict(
            word_dict, title_keywords)
        np.save(self.wd_dict_file, [word_dict])
        np.save(self.data_title_file, [title_features])

        # Features built on the score dictionaries, for content and titles.
        score_dicts = treater.get_score_dict()
        [word_id_dict, id_score_dict] = score_dicts
        content_score_features = treater.create_train_data_dict(
            word_id_dict, content_keywords)
        np.save(self.wd_id_dict_file, [word_id_dict])
        np.save(self.id_score_dict_file, [id_score_dict])
        np.save(self.data_score_file, [content_score_features])

        title_score_features = treater.create_train_data_dict(
            word_id_dict, title_keywords)
        np.save(self.data_score_title_file, [title_score_features])

        # Content features are stored together with the adjusted results.
        labels = np.array(result)
        np.save(self.data_file, [content_features, labels])
        self.create_random_seed(len(result))
예제 #2
0
    def pre_data_treate(self, filename):
        """Convert a data file into the four feature sets used for prediction.

        Args:
            filename: Path handed to ``self.DT.read_excel``; its third
                returned value is ignored here.

        Returns:
            list: ``[testdata, testdata_title, test_score_data,
            test_score_data_title]`` - content and title features built
            with the word dictionary, followed by content and title features
            built with the word-id (score) dictionary.

        Raises:
            Whatever ``np.load`` raises when a dictionary file is missing
            or unreadable.
        """
        test_title, test_content, _ = self.DT.read_excel(filename)

        # The dictionaries are stored wrapped in a one-element list
        # (presumably written by ``pretreatment``), i.e. as pickled object
        # arrays. NumPy >= 1.16.3 refuses to load those unless
        # allow_pickle=True is passed explicitly.
        # SECURITY: unpickling executes arbitrary code - only load files
        # produced by this project, never files from an untrusted source.
        [wd_dict] = np.load(self.wd_dict_file, allow_pickle=True)
        [wd_score_dict] = np.load(self.wd_id_dict_file, allow_pickle=True)

        PT = PreTreater()
        keydata = PT.get_keywords(test_content)
        testdata = PT.create_train_data_dict(wd_dict, keydata)
        test_score_data = PT.create_train_data_dict(wd_score_dict, keydata)

        # NOTE(review): ``pretreatment`` extracts title keywords with
        # all_tag=True while this call uses the default - confirm that the
        # train/test difference is intended.
        keydata_title = PT.get_keywords(test_title)
        testdata_title = PT.create_train_data_dict(wd_dict, keydata_title)
        test_score_data_title = PT.create_train_data_dict(
            wd_score_dict, keydata_title)

        return [testdata, testdata_title, test_score_data, test_score_data_title]