Example #2
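The class relies on several standard modules plus project helpers (sanitise, getFileName, readObject, findBins, findResults, DistanceUncertainty) defined elsewhere in the source module. A minimal import header under those assumptions (the pyplot alias for pl is a guess; the original may use pylab):

import os
import re
import inspect
import types
import pickle

import numpy as np
import matplotlib.pyplot as pl
from joblib import Parallel, delayed

# sanitise, getFileName, readObject, findBins, findResults and DistanceUncertainty
# are project-level helpers, not shown in this excerpt.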
class ObservedDistribution:

	def __init__(self, parser, ind_attr, contours, dep_attr, weight_std_ratio=None, retrain=False, prefix=None, save=True):
		# If no prefix is provided, derive one from ind_attr
		if prefix is None:
			prefix = "ods/"+sanitise(ind_attr)+"/"
			if not os.path.exists(prefix):
				os.makedirs(prefix)
		# If no weight ratio is provided, try to recover one from an existing .od filename
		if weight_std_ratio is None:
			pattern = re.compile(sanitise(ind_attr)+'_[0-9]_'+sanitise(dep_attr))
			for f in os.listdir(prefix):
				fn,ext = os.path.splitext(f)
				if ext == ".od" and pattern.match(sanitise(fn)) is not None:
					weight_std_ratio = float(fn.rsplit("_")[-1])
					print "Found",f,"and setting weight ratio to",weight_std_ratio
					break
		if weight_std_ratio is None:
			print "No suitable weight factor found in provided OD files, using 0.15."
			weight_std_ratio = 0.15
		self.weight_std_ratio = weight_std_ratio
		self.contours = contours
		self.prefix = prefix
		self.filename = getFileName(ind_attr, contours, dep_attr, weight_std_ratio)
		self.path = os.path.join(prefix,self.filename)
		# If this OD has already been computed, read the file it was saved to and copy its attributes
		if os.path.isfile(self.path) and not retrain:
			od = readObject(self.path)
			attributes = inspect.getmembers(od)
			for a in attributes:
				if not isinstance(a[1], types.MethodType):
					setattr(self, a[0], a[1])
			print("read in", self.path)
		# otherwise build the od as normal
		else:
			self.retrain(parser, ind_attr, dep_attr, save=save)
			
	def __repr__(self):
		return self.filename
	
	def refresh(self, ind_val=None):
		# refresh the cached lists and derived lookups from the parser
		self.ind_list = self.Data.getList(self.ind_attr, False)
		self.dep_list = self.Data.getList(self.dep_attr, True)
		self.dep = {}
		self.ind = list(set(self.ind_list))
		self.listToContours = [self.ind.index(i) for i in self.ind_list]
		self.DU = DistanceUncertainty(self)
		self.indQuantities = [self.ind_list.count(i) for i in self.ind]
		
		# Gaussian kernel precision 1/(2*(ratio*std)^2); see weightFunction below
		self.std = np.std(self.ind_list)
		self.weightFactor = 1.0 / (2 * ((float(self.weight_std_ratio) * self.std) ** 2))
		
		# The points ordered by their dependent attribute
		orderedPoints = sorted(zip(self.dep_list, self.ind_list))
		self.sorted_dep_list, self.sorted_ind_list = zip(*orderedPoints)
		if ind_val is not None:
			# report where the points with this independent value landed after sorting
			new_indices = [i for i in range(len(orderedPoints)) if self.sorted_ind_list[i] == ind_val]
			return new_indices
	
	def retrain(self, parser, ind_attr, dep_attr, save=True):
		self.bins = [0.5]
		# Expand the bins to the number of contours selected (zero contours = just predict the median)
		for i in range(self.contours):
			self.bins = np.concatenate([[self.bins[0]/2],self.bins,[1-self.bins[0]/2]])
		# convert self.bins back into a list (np.concatenate returns an ndarray)
		if not isinstance(self.bins, list):
			self.bins = self.bins.tolist()
		self.bins.sort()
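		# e.g. contours=2 yields bins [0.125, 0.25, 0.5, 0.75, 0.875]:
		# the median plus two symmetric pairs of quantile boundaries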
		
		# set parser
		self.Data = parser
		# set attributes
		self.ind_attr = ind_attr
		self.dep_attr = dep_attr
		self.refresh()
		
		# Parallel computation across all cores, one job per unique independent value
		# (pass the bound weightFunction method; a bare name here would be undefined)
		results, self.weights = zip(*Parallel(n_jobs=-1)(delayed(findBins)(value, self.sorted_ind_list, self.sorted_dep_list, self.weightFunction, self.weightFactor, self.bins, findResults) for value in self.ind))
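		# findBins and findResults are module-level helpers not shown in this excerpt;
		# judging by finishTraining below, each findBins call returns a list of
		# (bin, value) pairs together with the point weights for that input value.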
		
		self.finishTraining(results)
		
		if save and not os.path.isfile(self.path):
			self.saveObject(self.path)
	
	def finishTraining(self, results):
		self.weights = list(self.weights)
		
		# Build the training dots: for each bin boundary, collect its computed
		# value at every unique independent value
		for b in self.bins:
			self.dep[b] = []
		
		for r in results:
			for pair in r:
				self.dep[pair[0]].append(pair[1])
		
		# set self.ind and self.dep (these are what ED will train on)
		self.ind = np.array(self.ind).T
		for b in self.bins:
			self.dep[b] = np.array(self.dep[b])
			# Avoid a crash that occurs when every value in self.dep[b] is identical.
			if np.allclose(self.dep[b], self.dep[b][0]):
				self.dep[b][0] *= 1.01
				print("Ran into a Y-axis contour ({}) that has no variance in any of the point weights.".format(b))
				print("This probably means the weighting factors are inappropriately large.")
				print("Avoiding a crash in the SVR by falsely editing the first datapoint by 1% and then proceeding.")
	
	# function to find the weights for each point around a single point on the independent axis
	def weightFunction(self, value, weightFactor=None):
		wf = self.weightFactor
		if weightFactor is not None:
			wf = 1.0 / (2 * ((weightFactor * self.std) ** 2))
		return np.exp(-wf * ((np.array(self.ind_list) - value) ** 2))
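	# Worked example of the kernel above: with weight_std_ratio r = 0.15 and data
	# std sigma, wf = 1/(2*(0.15*sigma)**2) ~ 22.2/sigma**2, so a point one sigma
	# away from `value` gets weight exp(-22.2) ~ 2e-10: only near neighbours count.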
	
	# getter functions
	def distanceUncertainty(self, values):
		return self.DU.distanceUncertainty(values)
	
	def indAttrName(self, san=False):
		if san:
			return sanitise(self.ind_attr)
		return self.ind_attr
	
	def indAttr(self):
		return self.ind
	
	def indAttrList(self):
		return self.ind_list
		
	
	def depAttrName(self, san=False):
		if san:
			return sanitise(self.dep_attr)
		return self.dep_attr
	
	def observedContours(self):
		return self.dep
	
	def scaledDepAttr(self):
		return self.Data.getList(self.dep_attr, True)
	
	def unscaledDepAttr(self):
		return self.Data.getList(self.dep_attr, False)
	
	def limits(self):
		return [[min(self.ind),max(self.ind)],[min(self.unscaledDepAttr()),max(self.unscaledDepAttr())]]
		
	# scale and unscale points trained on the scaled dots
	def unscalePoints(self, vals):
		return vals*np.array(self.Data.getScale(self.dep_attr))+np.array(self.Data.getTranslate(self.dep_attr))
		
	def scalePoints(self, vals):
		# inverse of unscalePoints: subtract the translation first, then divide by the
		# scale (dividing first, as the original did, does not invert unscalePoints)
		return (vals-np.array(self.Data.getTranslate(self.dep_attr)))/np.array(self.Data.getScale(self.dep_attr))
	
	def plotArtefacts(self,stroke=None,fill='black',plot=None,alpha=1):
		if plot is None:
			plot = pl.figure().add_subplot(1,1,1)
		plot.scatter(self.ind_list, self.unscalePoints(self.dep_list), edgecolor=stroke,facecolor=fill,s=2,lw=0.25,alpha=alpha)
		return plot

	def plotArtefact(self,x=None,y=None,plot=None,alpha=1,ED=None):
		if plot is None:
			plot = pl.figure().add_subplot(1,1,1)
		if x is None:
			minx = min(self.ind_list)
			maxx = max(self.ind_list)
			x = minx+np.random.random()*np.ptp([minx, maxx])
		if y is None:
			y_pred = ED.getExpectationsAt(x,False)
			error_scale = np.mean(y_pred[self.bins[-1]]-y_pred[self.bins[0]])
			y = ED.getExpectationsAt(x,False,medianOnly=True)+((np.random.random()*error_scale)-(0.5*error_scale))
		
		text_pos = pl.ylim()[0]+np.ptp(pl.ylim())*0.025
		
		surprise,raw_surprise = ED.surpriseCalc(x,y,None,False)
		plot.axvline(x)
		plot.scatter(x,y,s=500,c='r',marker='*')
		text = "".join(["  Hypothetical phone: S=",str(round(surprise,3)),' (raw: ',str(round(abs(raw_surprise),3)),')'])
		plot.annotate(text,[x,text_pos],color='b')
		return plot
	
	# plot the results
	def plotObservedContours(self, title="", plot=None, alpha=1):
		if plot is None:
			plot = pl.figure().add_subplot(1,1,1)
		# median dot size (for Kaz)
		med_S = 2
		# regular dot size
		reg_S = .5
		# data size
		data_S = 5
		
		centralIndex = self.bins.index(0.5)
		for i,b in enumerate(self.bins):
			dist_from_med = float(abs(i-centralIndex))/(len(self.bins) *.5)
			color = (1-dist_from_med, dist_from_med, 0)
			if b == 0.5:
				S = med_S
			else:
				S = reg_S
			zipped = sorted(zip(self.ind, self.unscalePoints(self.dep[b])))
			x, y = zip(*zipped)
			plot.plot(x, y, color=color, lw=S, alpha=alpha)
		if self.ind_attr is None:
			plot.set_xlabel('$Year$')
		else:
			plot.set_xlabel(self.ind_attr)
		plot.set_ylabel(self.dep_attr)
		if len(title) > 0:
			plot.set_title(title)
		return plot
	
	def plotWeights(self, value, plot=None, alpha=1):
		if plot is None:
			plot = pl.figure().add_subplot(1,1,1)
		weights = self.weightFunction(value)
		weights *= 10000.0/sum(weights)
		weights = np.sqrt(weights)
		plot.scatter(self.ind_list, self.unscalePoints(self.dep_list), color='r', s=weights, alpha=alpha)
		return plot
	
	# Show the current plot(s).
	def show(self):
		pl.show()
	
	# Save the current plot to a given filename.
	def saveFig(self, filename):
		pl.savefig(filename)
		print('Saved', filename)
	
	def saveObject(self, filename):
		with open(filename, 'wb') as output:
			pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
		print "Saved,",filename