def pretreatment(self):
    """Read the raw excel corpus and persist every derived training artifact.

    Reads ``[title, content, result]`` from ``self.origin_data_file`` via
    ``self.DT``, clamps negative labels to -1, extracts keyword features with
    a ``PreTreater``, and saves the word dictionaries and train matrices to
    the ``.npy`` paths configured on ``self``.  Finally seeds the random
    train/test split via ``self.create_random_seed``.
    """
    # read data
    [title, content, result] = self.DT.read_excel(self.origin_data_file)
    # Collapse every negative label to exactly -1 so downstream models see a
    # single negative class.
    for i, label in enumerate(result):
        if label < 0:
            result[i] = -1

    PT = PreTreater()
    keydata = PT.get_keywords(content)
    wd_dict = PT.getdict()
    traindata = PT.create_train_data_dict(wd_dict, keydata)

    # Title-based features (consumed by the lr-title model).
    keydata_title = PT.get_keywords(title, all_tag=True)
    train_title_data = PT.create_train_data_dict(wd_dict, keydata_title)
    np.save(self.wd_dict_file, [wd_dict])
    np.save(self.data_title_file, [train_title_data])

    # Score-dictionary features (consumed by the score model).
    [wd_id_dict, id_score_dict] = PT.get_score_dict()
    traindata_score = PT.create_train_data_dict(wd_id_dict, keydata)
    np.save(self.wd_id_dict_file, [wd_id_dict])
    np.save(self.id_score_dict_file, [id_score_dict])
    np.save(self.data_score_file, [traindata_score])

    traindata_title_score = PT.create_train_data_dict(wd_id_dict, keydata_title)
    np.save(self.data_score_title_file, [traindata_title_score])

    np.save(self.data_file, [traindata, np.array(result)])
    self.create_random_seed(len(result))
def cacul(self, x_te, id_score_dict):
    """Compute one dictionary-based sentiment score per row of a sparse matrix.

    :param x_te: scipy sparse matrix (rows = documents, cols = word ids);
                 only the non-zero column positions are used.
    :param id_score_dict: mapping word-id -> (word, score) pair; column 1 of
                 the value is the numeric score.
    :return: 1-D float array, one score per row, scaled by the score range.
    """
    sum_score = np.zeros(x_te.shape[0], dtype=float)
    # Values are (word, score) pairs; take the score column.  list(...) is
    # required so this also works on Python 3, where dict.values() is a view
    # that numpy cannot index as a 2-D array.
    id_score_vector = np.array(
        np.array(list(id_score_dict.values()))[:, 1], dtype=float)
    # Only works for sparse input: for each row, sum the dictionary scores of
    # the word ids that are present (non-zero).
    for row_idx in range(x_te.shape[0]):
        _, col = x_te[row_idx].nonzero()
        sum_score[row_idx] = np.sum(id_score_vector[col])
    # Scale by the observed score range.  Guard against a zero range (e.g. a
    # single row, or all rows equal), which previously produced NaN/inf.
    score_range = sum_score.max() - sum_score.min()
    if score_range != 0:
        sum_score = sum_score / score_range
    return sum_score


if __name__ == '__main__':
    # Smoke test: build the score dictionary, save it, and score a tiny
    # 2 x 100 sparse matrix with three non-zero entries.
    DSM = DictScoreModel()
    PT = PreTreater()
    wd_score_idx, id_score_idx = PT.get_score_dict('../data/score.txt')
    np.save('../data/id_score_dict.npy', [id_score_idx])
    from scipy.sparse import csr_matrix
    content = csr_matrix(([1, 1, 1], [5, 10, 22], [0, 2, 3]),
                         shape=(2, 100), dtype=float)
    # print(...) with a single argument is valid on both Python 2 and 3.
    print(DSM.predict(content))
        # NOTE(review): this `else:` pairs with an `if` (inside an enclosing
        # def) that sits before this chunk — presumably a check that the
        # wd_score_dict file was loaded successfully; cannot confirm from here.
        else:
            print 'failed to load the file of wd_score_dict'
            sys.exit(2)

    def cacul(self, x_te, id_score_dict):
        """Compute one dictionary-based sentiment score per row of x_te.

        x_te: sparse matrix (rows = documents, cols = word ids); only the
            non-zero column positions contribute.
        id_score_dict: mapping word-id -> (word, score) pair; column 1 of the
            value is read as the numeric score.
        Returns a 1-D float array scaled by the observed score range.
        """
        sum_score = np.zeros((x_te.shape[0]), dtype=float)
        # sum_score_cp = np.zeros((x_te.shape[0]), dtype=float)
        # Values are (word, score) pairs; take the score column.
        # NOTE(review): id_score_dict.values() indexed as a 2-D array only
        # works on Python 2, where values() returns a list.
        id_score_vector = np.array(np.array(id_score_dict.values())[:, 1], dtype=float)
        #only work for the sparse data
        for row_idx in range(x_te.shape[0]):
            # `row` is unused; only the non-zero column ids matter.
            row, col = x_te[row_idx].nonzero()
            # sum_score[row_idx] = np.dot(id_score_vector[col], x_te[row_idx, :].data)
            sum_score[row_idx] = np.sum(id_score_vector[col])
        # pos_mean = np.mean(sum_score[sum_score > 0])
        # neg_mean = np.mean(sum_score[sum_score < 0])
        # sum_score_cp[sum_score > pos_mean] = 1
        # sum_score_cp[sum_score < neg_mean] = -1
        # Scale by the score range.  NOTE(review): divides by zero (NaN/inf)
        # when all row scores are equal, e.g. a single-row input.
        sum_score = sum_score/(max(sum_score) - min(sum_score))
        return sum_score


if __name__ == '__main__':
    # Smoke test: build and save the score dictionary, then score a tiny
    # 2 x 100 sparse matrix with three non-zero entries.
    DSM = DictScoreModel()
    PT = PreTreater()
    wd_score_idx, id_score_idx = PT.get_score_dict('../data/score.txt')
    np.save('../data/id_score_dict.npy', [id_score_idx])
    from scipy.sparse import csr_matrix
    content = csr_matrix(([1,1,1], [5, 10 ,22], [0,2,3]), shape=((2,100)), dtype=float)
    print DSM.predict(content)