Example #1
    def goldenSectionSearch(self, obj, a, b, n):
        # Golden-section search: place two interior points at the golden-ratio
        # fractions of [a, b] and shrink the interval around the better one.
        l = a + 0.382 * (b - a)
        h = a + 0.618 * (b - a)
        region = b - a
        num = 1

        while (region > 0.01 and num <= n):
            fl = self.lossFunction(obj, l)
            fh = self.lossFunction(obj, h)
            #print("iter{0} fl={1:.4f} fh{2:.4f}".format(num,fl,fh))
            if (fl > fh):
                a = l
                l = h
                h = a + 0.618 * (b - a)
            else:
                b = h
                h = l
                l = a + 0.382 * (b - a)
            num += 1
            region = abs(b - a)
            utils.updateProgress(self.__scene)  #0.6sec

        moveVal = (a + b) / 2
        return moveVal
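
A minimal standalone sketch of the same golden-section idea on a toy objective may help; the names golden_section and loss below are illustrative and not part of the project above:

def golden_section(loss, a, b, max_iter=50, tol=0.01):
    # Interior points at the golden-ratio fractions of the interval.
    l = a + 0.382 * (b - a)
    h = a + 0.618 * (b - a)
    it = 0
    while abs(b - a) > tol and it < max_iter:
        if loss(l) > loss(h):
            a, l = l, h                    # minimum lies in [l, b]
            h = a + 0.618 * (b - a)
        else:
            b, h = h, l                    # minimum lies in [a, h]
            l = a + 0.382 * (b - a)
        it += 1
    return (a + b) / 2.0

print(golden_section(lambda x: (x - 3.0) ** 2, 0.0, 10.0))  # approx. 3.0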
Example #2
    def bruteForceSearch(self, obj, valList):

        errList = []
        for val in valList:
            err = self.lossFunction(obj, val)
            errList.append(err)
            utils.updateProgress(self.__scene)
        minIdx = errList.index(min(errList))
        moveVal = valList[minIdx]

        return moveVal
Example #3
File: manga.py Project: jawb/MangaGetter
def download(title, chapters):

	dir = DOWNLOAD+title+os.sep

	utils.mkdir_p(dir)

	print "Downloading "+title+" (", len(chapters) ,")..."
	for chap in chapters:

		utils.mkdir_p(dir+str(chap.number))

		images = PLUGIN.getImages(chap.link)
		for i,img in enumerate(images):
			utils.putUrlContent(img.link, dir+str(chap.number)+
				os.sep+img.title+utils.getExtension(img.link))
			utils.updateProgress(chap.title, int(i*1.0/(len(images)-1)*100))
		print ""

		open(dir+str(chap.number)+os.sep+'.completed', 'w+').close()  # mark the chapter as completed
Example #4
def getTransactions(fileSrc, products):
	# main variables to be returned
	transactions = []
	reverseItemLookup = {}
	for key in products:
		reverseItemLookup[key[0]] = set()
	dataFile = codecs.open(fileSrc, 'rb', 'utf-8') 	# specify utf-8 encoding
	print "Loading Transactions..."
	lines = dataFile.readlines() 					# read all lines
	if settings.PROGRESS_BAR == True:
		util.updateProgress(0)						# create a progress bar

	# test every line and extract its relevant information
	for idx, line in enumerate(lines):				# test each line
		if settings.PROGRESS_BAR == True:
			util.updateProgress(float(idx) / float(len(lines)))
		lineList = line.split(", ")
		# Remove first item in the list
		lineList.pop(0)

		lineSet = set()
		for idx2, item in enumerate(lineList):
			if float(item) != 0:
				# Add the index to our list to indicate that the product has been bought
				reverseItemLookup[products[idx2][0]].add(idx)
				lineSet.add(idx2)
		# append our array to our transactions
		transactions.append(lineSet);
	if settings.PROGRESS_BAR == True:
		util.updateProgress(1)
		print "\n"
	# return our list of products
	return [transactions, reverseItemLookup]
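
These examples pass util.updateProgress a completion fraction between 0 and 1. The util module itself is not shown here; a purely hypothetical helper with a compatible signature could look like this:

import sys

def updateProgress(fraction, width=40):
    # Render a simple one-line text progress bar for a fraction in [0, 1].
    fraction = min(max(fraction, 0.0), 1.0)
    filled = int(round(width * fraction))
    bar = "#" * filled + "-" * (width - filled)
    sys.stdout.write("\r[%s] %3d%%" % (bar, int(fraction * 100)))
    sys.stdout.flush()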
Example #5
def getProducts(fileSrc):
	# main variables to be returned
	products = []
	dataFile = codecs.open(fileSrc, 'rb', 'utf-8') 	# specify utf-8 encoding
	print "Retrieving products..."
	lines = dataFile.readlines() 					# read all lines
	if settings.PROGRESS_BAR == True:
		util.updateProgress(0)						# create a progress bar
	# test every line and extract its relevant information
	for idx, line in enumerate(lines):				# test each line
		if settings.PROGRESS_BAR == True:
			util.updateProgress(float(idx) / float(len(lines)))
		lineList = line.split(", ")
		lineList[1] = float(lineList[1])
		products.append(lineList);
	if settings.PROGRESS_BAR == True:
		util.updateProgress(1)
		print "\n"
	# return our list of products
	return products
Example #6
	def __init__(self, trainingData, attributes):
		print "Training Bayesian Classifier with " + str(len(trainingData)) + " data entries.\n"
		# COUNT VARIABLES
		print "Counting all variables:"
		if settings.PROGRESS_BAR == True:
			util.updateProgress(0)
		# Sort the training data into two bins based on classifier, meanwhile recording the counts for each variable
		numOfEntries = float(len(trainingData))
		categoricalCounts = {}		# Holds counts of each category
		self.classifierBins = {}	# Holds the data points for each classifier
		self.probability = {}
		self.numericBins = {}
		count = 0.0
		for entry in trainingData:	# for every data row...
			count += 1.0
			if settings.PROGRESS_BAR == True:
				util.updateProgress(count / (numOfEntries))
			for attr in entry:		# for each attribute...
				if util.isNumber(entry[attr]) == False:			# for categorical attributes
					if entry[attr] in categoricalCounts:		# if we have already created a key for this
						categoricalCounts[entry[attr]] += 1.0	# increment the key
					else:										# otherwise we create a new key and set it to 1
						categoricalCounts[entry[attr]] = 1.0
					if attr == settings.CLASSIFIER_NAME:		# if we are on the classifier, in this case "class"
						if entry[attr] in self.classifierBins:	# add the row to the classifier bins,
							self.classifierBins[entry[attr]].append(entry)
						else:
							self.classifierBins[entry[attr]] = [entry]
				else:															# For Numeric Attributes
					key = attr + ' given ' + entry[settings.CLASSIFIER_NAME]  	# declare a key 
					if key in self.numericBins:									# if the key is already in our numeric bins
						bisect.insort(self.numericBins[key], entry[attr])		# insert the numeric attribute in a sorted location
					else:
						self.numericBins[key] = [entry[attr]]					# if it doesn't exist, create a list for it
		# DEAL WITH CONTINUOUS VARIABLES
		initialKeys = self.numericBins.keys()
		for key in initialKeys:
			self.numericBins[key + " mean"] = np.mean(self.numericBins[key])	# store mean of each prob
			self.numericBins[key + " stdev"] = np.std(self.numericBins[key])	# store std deviation of each continuous var
		for attr in attributes:									# if we have not stored values for certain attributes, we do so now, using smoothing techniques
			if attr[1] != 'real':
				for attrType in attr[1]:
					if attrType not in self.probability:
						self.probability[attrType] = .5 / numOfEntries
						for name in self.classifierBins:
							self.probability[attrType + " given " + name] = .5 / len(self.classifierBins[name])



		# ASSIGN PROBABILITIES
		print "\n\nAssigning probabilities:"
		# Now we have two bins, each holding our different classifiers and counts of all our variables
		if settings.PROGRESS_BAR == True:
			util.updateProgress(0)
		for key in categoricalCounts.keys(): 							# Assign categorical counts
			self.probability[key] = self.getProbability(categoricalCounts[key], numOfEntries)
		attrs = categoricalCounts.keys()			# get the attrs we will iterate through
		count = 0.0									# create a count used to log to the status bar
		for key in self.classifierBins.keys():		# for each classifier type...
			count += 1
			if settings.PROGRESS_BAR == True:
				util.updateProgress(count / float(len(self.classifierBins.keys()))) # update progress bar
			
			for row in self.classifierBins[key]:			# for each row in the classifierBins...
				for rowKey in row:							# for each key in the row...
					if util.isNumber(row[rowKey]) == False:	# if we're dealing with a categorical variable...
						newKey = row[rowKey] + " given " + key  # create a key variable
						if newKey in categoricalCounts:			# count number of items included in that section
							categoricalCounts[newKey] += 1.0
						else:
							categoricalCounts[newKey] = 1.0
			for attrValue in attrs:								# for every attrValue...
				countKey = attrValue + " given " + key 			# create a key
				if countKey in categoricalCounts:				# add to categoricalCounts our conditional probabilities
					self.probability[countKey] = self.getProbability(categoricalCounts[countKey], len(self.classifierBins[key])) 	# Assign conditional probabilities
				else:
					self.probability[countKey] = self.getProbability(0, len(self.classifierBins[key]))
		if settings.PROGRESS_BAR == True:
			util.updateProgress(1)
		print "\nModel creation complete\n"
Example #7
def readArff(fileSrc):
	# main variables to be returned
	relation = ""									# relation		
	attributes = []									# attribute list
	rawData = []									# main data storage
	reverseLookup = {}								# store by value for reverse lookup
	continuousVariables = {}
	categoricalVariables = {}
	dataFile = codecs.open(fileSrc, 'rb', 'utf-8') 	# specify utf-8 encoding
	print "Reading file..."
	lines = dataFile.readlines() 					# read all lines
	if settings.PROGRESS_BAR == True:
		util.updateProgress(0)					# create a progress bar
	# test every line and extract its relevant information
	for idx, line in enumerate(lines):				# test each line
		if settings.PROGRESS_BAR == True:
			util.updateProgress(float(idx) / float(len(lines)))
		if line[0] == '%':							# ignore comments
			continue
		elif line[0] == '@':						# if is metadata
			if '@relation' in line:					# if relation
				arrayLine = line.split(" ")
				relation = arrayLine[1]
			elif "@attribute" in line:				# if attribute
				arrayLine = line.split(" ")
				attributes.append([arrayLine[1]])
				if "real" not in arrayLine[2]:		# if attribute is not real (is categorical)
					attrs = re.search('\{(.*?)\}', line).group()	# select text between brackets
					attrs = re.sub('[\{\}]', "", attrs)				# remove brackets
					newAttrs = attrs.split(", ")					
					options = []
					for attr in newAttrs:
						options.append(attr)
					attributes[len(attributes) - 1].append(options)
				else: 							# if it is real
					attributes[len(attributes) - 1].append('real')
		elif line[0] == " ":
				continue
		else:
			line = line.replace(" ", "")
			line = line.replace("\n", "")
			line = line.split(",")
			newDataEntry = {}							# create a new object to store our row data
			for idx, value in enumerate(line):			# for every column of data
				attribute = attributes[idx]
				if util.isNumber(value):						# convert string to float if it's a number
					value = float(value)
				# Add value to our reverse lookup under the key "attributeName attributeValue"
				rlKey = attribute[0] + " " + str(value) 		# create key for our reverseLookup data structure
				if rlKey in reverseLookup:
					reverseLookup[rlKey].append(len(rawData)) # append index of our current row (the length of data) for quick lookup later
				else:
					reverseLookup[rlKey] = [len(rawData)]	# create a new arrayList to store our indices if one does not already exist
				# fill our newData Entry
				newDataEntry[attribute[0]] = value 		# store the value under its proper key
				# add variables to our bins
				if attribute[1] == 'real':  				# if the attribute is real, we place it in a continuous bin
					if attribute[0] in continuousVariables:
						continuousVariables[attribute[0]].add(value, line[len(line) - 1])							# add our value to our continuous bin
					else:
						continuousVariables[attribute[0]] = util.continuousBin(attribute[0])	# instantiate a continuous bin to hold our variable
						continuousVariables[attribute[0]].add(value, line[len(line) - 1])
				else:									# if the attribute is categorical, we place it in a categorical bin
					if attribute[0] in categoricalVariables:
						categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
					else:
						categoricalVariables[attribute[0]] = util.categoricalBin(attribute[1])
						categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
			rawData.append(newDataEntry)					# append data entry to all of our data
	# END OF FOR LOOP
	results = {}
	results['data'] = rawData
	results['attributes'] = attributes
	results['relation'] = relation
	results['lookup'] = reverseLookup
	results['continuousVariables'] = continuousVariables
	results['categoricalVariables'] = categoricalVariables
	if settings.PROGRESS_BAR == True:
		util.updateProgress(1)
	print "\nFile read complete \n"
	return results
Example #8
    def optimize (self, X, Y ):
        """ Optimizes the number of trees (estimators) and max features used (features)
            and returns the best values, according to the OOB criteria

            The results are shown in a diagnostic plot

            To avoid including many trees that produce only tiny gains, improvements in OOB error
            smaller than 0.01 are considered irrelevant
        """

        RANDOM_STATE = 1226
        errors = {}
        features = ['sqrt','log2','none']

        if self.quantitative:
            tclf = {'sqrt': RandomForestRegressor(warm_start=False, oob_score=True,
                        max_features="sqrt",random_state=RANDOM_STATE),
                    'log2': RandomForestRegressor(warm_start=False, oob_score=True,
                        max_features="log2",random_state=RANDOM_STATE),
                    'none': RandomForestRegressor(warm_start=False, oob_score=True,
                        max_features=None  ,random_state=RANDOM_STATE) }
        else:
            tclf = {'sqrt': RandomForestClassifier(warm_start=False, oob_score=True,
                        max_features="sqrt",random_state=RANDOM_STATE,
                        class_weight=self.class_weight),
                    'log2': RandomForestClassifier(warm_start=False, oob_score=True,
                        max_features="log2",random_state=RANDOM_STATE,
                        class_weight=self.class_weight),
                    'none': RandomForestClassifier(warm_start=False, oob_score=True,
                        max_features=None  ,random_state=RANDOM_STATE,
                        class_weight=self.class_weight) }

        # Range of `n_estimators` values to explore.
        min_estimators = 15
        max_estimators = 700
        stp_estimators = 100

        num_steps = int((max_estimators-min_estimators)/stp_estimators)

        print 'optimizing RF....'
        updateProgress (0.0)

        optValue = 1.0e10
        j = 0
        for fi in features:
            errors[fi] = []
            count = 0
            for i in range(min_estimators, max_estimators + 1,stp_estimators):
                clf = tclf[fi]
                clf.set_params(n_estimators=i)
                clf.fit(X,Y)
                oob_error = 1 - clf.oob_score_
                errors[fi].append((i,oob_error))
                if oob_error < optValue:
                    if np.abs(oob_error - optValue) > 0.01:
                        optValue = oob_error
                        optEstimators = i
                        optFeatures = fi

                updateProgress (float(count+(j*num_steps))/float(len(features)*num_steps))
                count = count+1
            j=j+1

        for ie in errors:
            xs, ys = zip (*errors[ie])
            plt.plot(xs, ys, label=ie)

        plt.xlim(min_estimators, max_estimators)
        plt.xlabel("n_estimators (Trees)")
        plt.ylabel("OOB error rate")
        plt.legend(loc="upper right")
        # save the figure before show(), so the file is not blank once the window is closed
        plt.savefig(self.vpath+"/rf-OOB-parameter-tuning.png")
        plt.savefig("./rf-OOB-parameter-tuning.png")

        plt.show()

        print 'optimum features:', optFeatures, 'optimum estimators:', optEstimators, 'best OOB:', optValue

        return (optEstimators, optFeatures)
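
A minimal sketch of the same OOB-based selection of n_estimators, run on synthetic data; make_classification, the parameter grid and the 0.01 threshold are illustrative assumptions, not the project's settings:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, Y = make_classification(n_samples=300, n_features=10, random_state=0)

best_error, best_n = 1.0, None
for n in range(25, 201, 25):
    clf = RandomForestClassifier(n_estimators=n, oob_score=True, random_state=1226)
    clf.fit(X, Y)
    oob_error = 1.0 - clf.oob_score_                      # OOB error, as in the method above
    if best_n is None or oob_error < best_error - 0.01:   # ignore tiny improvements
        best_error, best_n = oob_error, n

print("best n_estimators: %d, OOB error: %.3f" % (best_n, best_error))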
Example #9
    def varSelectionFFD (self, X, Y , A, autoscale=False, gui=True):

        # TODO : set dummyStep and ratio as tunable parameters
        
        dummyStep = 4.0
        ratio     = 2.0

        # TODO : check the number of X variables. FFD is not suitable for very large X matrices

        # build a X reduced matrix Xr
        nobj, nvarx = np.shape (X)
        nvarxOri = nvarx
        index = np.ones(nvarx,dtype=np.int)
        st = np.std (X, axis=0, ddof=1)
        for i in range (nvarx):
            if  st[i] < 1e-10:
                index[i] = 0  # set to 0 to allow creation of reduced matrices
        nvarxb = np.sum(index)

        #print index
        
        Xb = np.empty((nobj, nvarxb), dtype=np.float64)
        k=0
        for i in range (nvarx):
            if index[i]>0:
                Xb[:,k]=X[:,i]
                k+=1
        
        nobj, nvarx = np.shape (Xb)
        ndummy = int (np.floor(nvarx/dummyStep))              # number of dummy variables
        nvarxm = nvarx + ndummy                               # length of expanded vector
        ncomb, design  = generateDesignFFD (nvarxm, ratio)    # ncomb is the number of reduced models to be generated
                                                              # design is the matrix that designates whether every x variable
                                                              # is in/out of the design matrix
        # print nvarx, ndummy, nvarxm, ncomb

        # obtain first estimation of Y std error
        SSY0 = 0.0
        for i in range (nobj):
            SSY0+=np.square(Y[i]-np.mean(Y))
        SDEP0 = np.sqrt(SSY0/float(nobj))
        SDEP0x10 = 10.0 * SDEP0

        # initializes effects
        effect  = np.zeros(nvarxm,dtype=np.float64)
        xdesign = np.zeros(nvarx ,dtype=np.int)

        # set common model stuff
        self.autoscale = autoscale
        self.Y = Y.copy()

        if gui: updateProgress (0.0)
        
        for i in range(ncomb):

            # extract x design line (not considering dummies)            
            k=0
            for j in range (nvarxm):
                if j%(dummyStep+1) :         # non-dummy var
                    xdesign[k]=design[i][j]
                    k+=1
                    
            nvarxr = int(np.sum(xdesign>0))

            # if this design line contains few x vars skip the model validation
            if nvarxr <= (A+1) : continue
            
            # build a X reduced matrix Xr
            Xr = np.empty((nobj, nvarxr), dtype=np.float64)
            k=0
            for j in range (nvarx):
                if xdesign[j]>0:
                    Xr[:,k]=Xb[:,j]
                    k+=1

            # set the reduced matrix as model matrix and validate
            self.X = Xr.copy()
            self.validateLOO (A)
            
            # accumulate the min SDEP to a effect vector for every variable (including dummies)
            minSDEP = 2.0e10
            for a in self.SDEP:
                if a < minSDEP : minSDEP = a

            if minSDEP > SDEP0x10:
                minSDEP = SDEP0
            
            effect += design[i]*minSDEP

            if gui: updateProgress (float(i)/float(ncomb))

        # calculate effects
        effect /= (ncomb/2)
        
        # compute dummy effects
        dummyEffect = 0.00
        dummyMean = 0.00
        k  = 0

        for i in range(nvarxm):
            if not (i%(dummyStep+1)) :   # dummy var
                dummyMean+=effect[i]
        dummyMean/=ndummy

        for i in range(nvarxm):
            if not (i%(dummyStep+1)) :   # dummy var
                dummyEffect+=np.square(effect[i]-dummyMean)
                ##dummyEffect+=np.square(effect[i])                 ## old version: assuming mean of zero (?) 
                ##td+=1
            else :
                effect[k]=effect[i]
                k+=1

        if dummyEffect > 1e-6:
            dummySD = np.sqrt(dummyEffect/ndummy)
        else :
            dummySD = 0.001
            
        # compare with critical T values (two tail, 95%)    
        t = stats.t.ppf(0.9725,ndummy-1)
        effectCutoff = t * dummySD
        
        res = np.ones(nvarx,dtype=np.int)          # fixed (default)      
        for i in range(nvarx):
            if np.abs(effect[i]) < effectCutoff:   # uncertain
                res[i] = 2
            elif effect[i] > 0 :
                res[i] = 0                         # excluded

        #print res

        # map the result in a vector representing the full, original X
        resExp = np.ones(nvarxOri,dtype=np.int)
        k = 0
        for i in range (nvarxOri):
            if index[i]==0:
                resExp[i] = 0       # these were already excluded or are inactive variables
            else :
                resExp[i] = res[k]
                k += 1
        
        return resExp, np.sum(res==0)
Example #10
    def validateLOO (self, A, gui=False):
        """ Validates A dimensions of an already built PLS model, using Leave-One-Out cross-validation

            Returns nothing. The results of the cv (SSY, SDEP and Q2) are stored internally
        """

        if self.X is None or self.Y is None:
            return 
        
        X = self.X
        Y = self.Y     

        nobj,nvarx = np.shape (X)

        SSY0 = 0.0
        for i in range (nobj):
            SSY0+=np.square(Y[i]-np.mean(Y))

        SSY = np.zeros(A,dtype=np.float64)
        YP = np.zeros ((nobj,A+1),dtype=np.float64)

        if gui: updateProgress (0.0)
        
        for i in range (nobj):
            
            # build reduced X and Y matrices removing i object
            Xr = np.delete(X,i,axis=0)
            Yr = np.delete(Y,i)

            Xr,muxr = center(Xr)
            Xr,wgxr = scale (Xr, self.autoscale)
           
            Yr,muyr = center(Yr)

            xp = np.copy(X[i,:])
            
            xp -= muxr
            xp *= wgxr
            
            # predicts y for the i object, using A LV
            yp = self.getLOO(Xr,Yr,xp,A)      
            yp += muyr

            # updates SSY with the object i errors
            YP[i,0]=Y[i]
            
            for a in range(A):
                SSY[a]+= np.square(yp[a]-Y[i])
                YP[i,a+1]=yp[a]

            if gui : updateProgress (float(i)/float(nobj))

        if gui : print
        
        self.SSY  = SSY        
        self.SDEP = [np.sqrt(i/nobj) for i in SSY]
        self.Q2   = [1.00-(i/SSY0) for i in SSY]
        
        self.Av = A

        return (YP)
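
To make the statistics above concrete, here is a tiny self-contained illustration of SDEP and Q2 computed from leave-one-out residuals; the "model" is deliberately trivial (the mean of the training fold) and the data values are made up:

import numpy as np

Y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
nobj = len(Y)

SSY0 = np.sum(np.square(Y - np.mean(Y)))   # total sum of squares around the mean
SSY = 0.0
for i in range(nobj):
    Yr = np.delete(Y, i)                   # leave object i out
    yp = np.mean(Yr)                       # "predict" it with the training-fold mean
    SSY += np.square(yp - Y[i])

SDEP = np.sqrt(SSY / nobj)                 # standard deviation of error of prediction
Q2 = 1.0 - SSY / SSY0                      # cross-validated explained variance
print("SDEP: %.3f  Q2: %.3f" % (SDEP, Q2))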