def create_descriptors_pca(self, dim=90):
    '''
    Compute the PCA projection of all stored descriptors.
    :param dim: number of principal components to keep
    :return:
    '''
    print("start create_descriptors_pca ...")
    query = DB.DescriptorModel.select(
        DB.DescriptorModel.id,
        DB.DescriptorModel.descriptor).tuples().iterator()
    # Each row becomes [feature_id, d0, d1, ...]; the first column keeps the feature id.
    features = numpy.array([[x[0]] + list(x[1]) for x in query])
    print("create_descriptors_pca,count=%d,dim=%d" % (len(features), dim))

    start = time()
    print("build eigenvectors start time %s" % start)
    mean, eigenvectors = cv2.PCACompute(features[:, 1:], None, maxComponents=dim)
    fitted = cv2.PCAProject(features[:, 1:], mean, eigenvectors)
    #pca = PCA(n_components=dim)
    #fitted = pca.fit_transform(features[:,1:])
    print("build eigenvectors cost time %s" % (time() - start))

    print("saving data ...")
    #scaler = preprocessing.MinMaxScaler()
    #pca = scaler.fit_transform(pca)
    DB.db.connect()
    with DB.db.transaction():
        DB.PcaModel.drop_table(fail_silently=True)
        DB.PcaModel.create_table()
        #res = DB.TrainingResult()
        #res.name = "daisy_pca"
        #res.data = pca
        #res.save()
        for i in range(0, len(fitted)):
            model = DB.PcaModel()
            model.pca = fitted[i]
            # cast back to int: the numpy array holds the id as a float
            model.feature = int(features[i][0])
            model.save()

        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == "pca_mean").execute()
        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == "pca_eigenvectors").execute()

        tr = DB.TrainingResult()
        tr.name = "pca_mean"
        tr.data = mean
        tr.save()

        tr = DB.TrainingResult()
        tr.name = "pca_eigenvectors"
        tr.data = eigenvectors
        tr.save()

    print("create_descriptors_pca done")
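# Sketch (not part of the original pipeline): reloading the PCA basis saved above
# and projecting a single new descriptor with it. The method name is hypothetical,
# and it assumes DB.TrainingResult.data transparently restores the pickled numpy arrays.
def _project_descriptor_sketch(self, descriptor):
    mean = DB.TrainingResult.get(DB.TrainingResult.name == "pca_mean").data
    eigenvectors = DB.TrainingResult.get(
        DB.TrainingResult.name == "pca_eigenvectors").data
    # cv2.PCAProject expects a 2D row-vector input, so reshape the single descriptor.
    data = numpy.asarray(descriptor, dtype=numpy.float64).reshape(1, -1)
    return cv2.PCAProject(data, mean, eigenvectors)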
def create_classifier(self):
    DB.db.connect()
    clf = SGDClassifier(loss="modified_huber")
    labs_map = NameToIndex()
    with DB.db.transaction():
        offset = 0
        words_count = self.get_words_count()
        classes = numpy.arange(0, words_count)
        x_all = []
        y_all = []
        while True:
            print(' %d partial_fit %d' % (time(), offset))
            query = DB.Vocabulary\
                .select(DB.Vocabulary.lv1, DB.Vocabulary.lv2)\
                .join(DB.PcaModel, on=(DB.Vocabulary.feature == DB.PcaModel.feature))\
                .order_by(DB.Vocabulary.feature)\
                .offset(offset).limit(1000)\
                .tuples().iterator()
            features = numpy.array([[x[0]] + list(x[1]) for x in query])
            offset += len(features)
            if len(features) == 0:
                break

            Y = features[:, 0]
            X = features[:, 1:]

            labs = []
            for lab in Y:
                labs.append(labs_map.map(lab))

            # Keep a capped sample of the data so the running score below stays cheap.
            if len(x_all) < 10000:
                x_all = x_all + X.tolist()
                y_all = y_all + labs

            labs = numpy.array(labs)

            #clf = LinearSVC()
            #clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
            #clf.fit(X,labs)
            clf.partial_fit(X, labs, classes)
            print(clf.score(x_all, y_all))

        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == self.__class__.__name__ + "_clf").execute()
        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == self.__class__.__name__ + "_labs_map").execute()

        tr = DB.TrainingResult()
        tr.name = self.__class__.__name__ + "_clf"
        tr.data = clf
        tr.save()

        tr = DB.TrainingResult()
        tr.name = self.__class__.__name__ + "_labs_map"
        tr.data = labs_map
        tr.save()
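# Sketch (not part of the original pipeline): reloading the classifier and label map
# saved above to score one PCA-projected vector. The method name is hypothetical and
# it assumes DB.TrainingResult.data restores the pickled objects unchanged.
def _classify_vector_sketch(self, pca_vector):
    clf = DB.TrainingResult.get(
        DB.TrainingResult.name == self.__class__.__name__ + "_clf").data
    labs_map = DB.TrainingResult.get(
        DB.TrainingResult.name == self.__class__.__name__ + "_labs_map").data
    pred = clf.predict(numpy.asarray(pca_vector).reshape(1, -1))[0]
    # labs_map maps original labels to indices; mapping the prediction back depends
    # on NameToIndex's API, which is not shown here.
    return pred, labs_map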
def cluster_words_all(self):
    '''
    Cluster all samples.
    '''
    print("start cluster_words_all ...")
    offset = 0
    limit = 300
    cluster = MiniBatchKMeans(n_clusters=100, verbose=1)
    # First pass: stream the PCA vectors through MiniBatchKMeans in chunks.
    while True:
        print(' %d partial_fit %d' % (time(), offset))
        query = DB.PcaModel.select(DB.PcaModel.feature, DB.PcaModel.pca)\
            .offset(offset).limit(limit).tuples().iterator()
        features = numpy.array([[x[0]] + list(x[1]) for x in query])
        if len(features) == 0:
            break
        offset += len(features)
        X = features[:, 1:]
        cluster.partial_fit(X)

    DB.db.connect()
    with DB.db.transaction():
        DB.Vocabulary.drop_table(fail_silently=True)
        DB.Vocabulary.create_table()
        DB.Words.drop_table(fail_silently=True)
        DB.Words.create_table()

        # Second pass: assign every stored feature to its cluster (visual word).
        offset = 0
        while True:
            query = DB.PcaModel.select(
                DB.PcaModel.feature, DB.PcaModel.pca).offset(offset).limit(
                    1000).tuples().iterator()
            features = numpy.array([[x[0]] + list(x[1]) for x in query])
            if len(features) == 0:
                break
            offset += len(features)
            X = features[:, 1:]
            Y = features[:, 0]
            res = cluster.predict(X)
            for i in range(0, len(res)):
                # cast numpy values back to plain ints before writing to the DB
                DB.Words.insert(id=int(res[i])).upsert().execute()
                DB.Vocabulary.insert(word=int(res[i]), feature=int(Y[i])).execute()

        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == self.__class__.__name__ + "_clf").execute()
        tr = DB.TrainingResult()
        tr.name = self.__class__.__name__ + "_clf"
        tr.data = cluster
        tr.save()

    #print "%d words, %d core samples, %d noise"%(len(types.keys()),len(res.core_sample_indices_), len(types[-1]) )
    print("done cluster_words_all")
    #self.display_words()
    return cluster
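# Sketch (not part of the original pipeline): looking up which visual word a stored
# feature was assigned to by cluster_words_all. The method name is hypothetical.
def _word_for_feature_sketch(self, feature_id):
    return (DB.Vocabulary
            .select(DB.Vocabulary.word)
            .where(DB.Vocabulary.feature == feature_id)
            .scalar())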