def ccTopic():
    """Score co-clustered topic features per category, one-vs-rest.

    Options are parsed from ``sys.argv`` with the module-level ``parser``.
    For every category of the dataset the function:

    1. loads that category's bag-of-features file and labels its rows 1;
    2. loads the files of all *other* categories, draws (with replacement)
       as many rows as there are positives and labels them 0;
    3. co-clusters the combined matrix and sums the columns belonging to
       each column cluster to obtain ``nTopic`` topic features per row;
    4. runs ``classify.knnClassify`` on the topic features plus label and
       stores the mean and standard deviation of the returned scores.

    Returns:
        list: ``[perfMean, perfStd]``, two arrays with one entry per
        category, in the iteration order of the category map.

    Raises:
        ValueError: if the dataset has fewer than two categories, so no
            negative examples can be built.
    """
    options, _ = parser.parse_args(sys.argv[1:])
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    nTopic = options.nTopic
    ccType = options.ccType
    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    beta = options.beta
    verbose = options.verbose

    if verbose:
        print(' '.join(str(v) for v in (dataset, nRowCluster, nTopic,
                                        ccType, kernelType, beta, nFold,
                                        nCodeword)))
        print(options)

    dataPath = rootDir + dataset + bofDir
    catList = list(getCatMap(dataset).keys())
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)
    labelCol = nCodeword  # the column right after the codewords is the label

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCategory, catName in enumerate(catList):
        # positive examples: the current category
        catpos = np.genfromtxt(dataPath + catName + dataext, dtype=int)
        if verbose:
            print(catName)
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, labelCol] = 1

        # negative examples: every other category, each read from its OWN
        # file and stacked together
        negParts = [np.genfromtxt(dataPath + other + dataext, dtype=int)
                    for other in catList if other != catName]
        if not negParts:
            raise ValueError('at least two categories are required to '
                             'build negative examples')
        catneg = np.concatenate(negParts, axis=0)

        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, labelCol] = 0

        # combine positive and negative data
        bofData = np.concatenate((catpos, catneg), axis=0)

        if verbose:
            print('co-clustering...')
        # NOTE(review): bofData still carries the label column here and in
        # the aggregation below, so the label takes part in the topic
        # features -- confirm this is intended.
        ccData = cocluster.coclust(bofData, dataset, nRowCluster, nTopic,
                                   ccType)
        ccCol = np.array([int(i) for i in ccData[1].split()])

        # topic feature j = sum of the columns assigned to column cluster j
        tempCC = np.zeros((bofData.shape[0], nTopic))
        for j in np.unique(ccCol):
            tempCC[:, j] = bofData[:, ccCol == j].sum(axis=1)
        botData = np.vstack((tempCC.T, bofData[:, -1])).T

        if verbose:
            print('classifying...')
        # second argument (10) is presumably the neighbour count -- confirm
        # against classify.knnClassify
        catPerfKNN = classify.knnClassify(botData, 10, nFold, beta, nMetrics)
        perfMean[iCategory] = np.mean(catPerfKNN)
        perfStd[iCategory] = np.std(catPerfKNN)

    if verbose:
        print(perfMean)
        print(perfStd)
    return [perfMean, perfStd]
def ccUniversalTopicDictionary(wordn, topicn):
    """Build and save the column-cluster assignment for a whole dataset.

    The image-word matrices of all categories are stacked row-wise,
    co-clustered, and the column-cluster index of every column is written
    to the result file with ``np.savetxt``.

    Nothing is computed when the result file already exists, when any
    category file is missing, or when a category file cannot be read; in
    each of these cases a message is printed and the function returns.

    Args:
        wordn: number of codewords; used in the input/output file names
            (overrides ``options.nCodeword``).
        topicn: number of column clusters passed to ``coclust``; also used
            in the output file name (overrides ``options.nTopic``).

    Returns:
        None.
    """
    # acquire program arguments
    options, _ = parser.parse_args(sys.argv[1:])
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    ccType = options.ccType
    nCodeword = wordn
    nTopic = topicn
    verbose = options.verbose

    # echo arguments
    if verbose:
        print(' '.join(str(v) for v in (dataset, nRowCluster, nTopic,
                                        ccType, nCodeword)))
        print(options)

    # configure data path and other parameters
    dataPath = rootDir + dataset + imgWrdDir
    resultPath = rootDir + dataset + utdDir + ccType + dataset
    catList = list(getCatMap(dataset).keys())
    dataext = str(nCodeword) + imgWrdext
    resultFileName = resultPath + str(nCodeword) + str(nTopic) + utdext

    if os.path.exists(resultFileName):
        print('%s already written' % (resultFileName))
        return

    if not catList:
        print('no categories found for %s' % (dataset))
        return

    # bail out before any expensive work if a category file is missing
    iwmFileNames = [dataPath + catName + dataext for catName in catList]
    for catName, iwmFileName in zip(catList, iwmFileNames):
        if not os.path.exists(iwmFileName):
            print('%s missing in %s,%d' % (catName, dataset, wordn))
            return

    # read each category; the parts are joined once at the end instead of
    # re-copying a growing array on every iteration
    iwmParts = []
    for iwmFileName in iwmFileNames:
        try:
            iwmCatData = np.loadtxt(iwmFileName, dtype=np.int16,
                                    delimiter=' ')
        except Exception:
            # best effort: an unreadable or malformed file aborts the run,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            print('unable to read %s' % (iwmFileName))
            return
        if verbose:
            print('reading %s' % (iwmFileName))
        iwmParts.append(iwmCatData)
    iwmData = np.concatenate(iwmParts, axis=0)

    if verbose:
        print('co-clustering...')
    ccData = coclust(iwmData, dataset, nRowCluster, nTopic, ccType)

    # the indices of co-clusters columns
    ccCol = np.array(ccData[1].split(), dtype=np.int16)

    if verbose:
        print('writing %s' % (resultFileName))
    np.savetxt(resultFileName, ccCol, fmt='%d', delimiter=' ')
def ccWord():
    """Learn a co-clustered codebook, write BoF files and evaluate them.

    Options are parsed from ``sys.argv`` with the module-level ``parser``.
    The function:

    1. collates descriptor samples from every category and co-clusters
       them;
    2. reduces each sample to one value per column cluster (the 2-norm of
       the columns in that cluster) and averages the samples of each row
       cluster to form the codebook;
    3. writes the bag-of-features files via ``writebof``;
    4. for every category, builds a balanced one-vs-rest set (positives
       labelled 1, an equal number of rows drawn with replacement from all
       other categories labelled 0), shuffles it and scores it with
       ``classify.ccClassify``;
    5. plots the per-category mean and standard deviation with
       ``plotresult.ccPlot``.

    Returns:
        None.

    Raises:
        ValueError: if the dataset has fewer than two categories, so no
            negative examples can be built.
    """
    options, _ = parser.parse_args(sys.argv[1:])
    dataset = options.dataset
    nRowCluster = options.nRowCluster
    nColCluster = options.nColCluster
    ccType = options.ccType
    kernelType = options.kernelType
    beta = options.beta
    figfmt = options.figfmt
    nFold = options.nFold
    desc = options.desc
    nClusterSample = options.nClusterSample
    verbose = options.verbose

    if verbose:
        print(' '.join(str(v) for v in (dataset, nRowCluster, nColCluster,
                                        ccType, kernelType, beta, figfmt,
                                        nFold, nClusterSample)))

    dataPath = rootDir + dataset + dataDir
    catList = list(getCatMap(dataset).keys())
    dataext = '.' + desc
    nCategory = len(catList)
    dim = descdim.get(desc)
    # NOTE(review): under Python 2 with integer operands the division
    # truncates before np.round is applied -- confirm that is intended.
    nSamplePerCategory = int(np.round(nClusterSample / nCategory))

    if verbose:
        print('collating cluster data...')
    clusterData = collateClusterData(dataPath, dataext, catList,
                                     nSamplePerCategory, dim)

    if verbose:
        print('coclustering...')
    ccData = cocluster.coclust(clusterData, dataset, nRowCluster,
                               nColCluster, ccType)
    ccRow = np.array([int(i) for i in ccData[0].split()])
    ccCol = np.array([int(i) for i in ccData[1].split()])

    # one value per (sample, column cluster): 2-norm of the member columns
    cctemp = np.zeros((clusterData.shape[0], nColCluster))
    for j in np.unique(ccCol):
        cctemp[:, j] = np.linalg.norm(clusterData[:, ccCol == j], 2, axis=1)

    # codeword i = mean reduced sample over row cluster i
    codebook = np.zeros((nRowCluster, nColCluster))
    for i in np.unique(ccRow):
        codebook[i, :] = np.mean(cctemp[ccRow == i, :], 0)

    if verbose:
        print('writing bof...')
    writebof(dataset, catList, codebook, ccCol, nRowCluster, desc)

    bofPrefix = rootDir + dataset + bofDir
    bofSuffix = '_cc' + bofext
    labelCol = nColCluster  # the column right after the features is the label

    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCategory, catName in enumerate(catList):
        # positive examples: the current category
        catpos = np.genfromtxt(bofPrefix + catName + bofSuffix, dtype=int)
        catpos = catpos[:, :nColCluster + 1]
        catpos[:, labelCol] = 1

        # negative examples: all other categories stacked together
        negParts = [np.genfromtxt(bofPrefix + other + bofSuffix, dtype=int)
                    for other in catList if other != catName]
        if not negParts:
            raise ValueError('at least two categories are required to '
                             'build negative examples')
        catneg = np.concatenate(negParts, axis=0)

        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nColCluster + 1]
        catneg[:, labelCol] = 0

        # combine positive and negative data
        catData = np.concatenate((catpos, catneg), axis=0)
        # shuffle the rows to aid in random selection of train and test
        np.random.shuffle(catData)

        catPerf = classify.ccClassify(catData, kernelType, nFold, beta,
                                      nMetrics)
        perfMean[iCategory] = np.mean(catPerf)
        perfStd[iCategory] = np.std(catPerf)

    if verbose:
        print(perfMean)
        print(perfStd)
    plotresult.ccPlot(dataset, catList, perfMean, perfStd, figfmt, 'BoW',
                      ccType)