Пример #1
0
def main():
    """Print a tour of the ``Data`` accessors and ``analysis`` helpers.

    Loads ``cars.csv`` through ``Data`` and prints, caption first, the result
    of each accessor and each ``analysis`` routine called below.

    Every ``print`` is written as a single-argument call, so the function
    parses on Python 3 and still produces the same output on Python 2.
    """
    d = Data('cars.csv')

    # get_raw_* accessors
    print("Raw Headers")
    print(d.get_raw_headers())
    print("\n\n")
    print("Raw number of columns")
    print(d.get_raw_num_columns())
    print("\n\n")
    print("Raw number of rows")
    print(d.get_raw_num_rows())
    print("\n\n")
    print("13th row")
    print(d.get_raw_row(13))
    print("\n\n")
    print("Value at row 6, header 'Car'")
    print(d.get_raw_value(6, 'Car'))
    print("\n\n")

    # matrix attribute and the non-raw accessors
    print("Matrix data")
    print(d.matrix_data)
    print("\n\n")
    print("Headers")
    print(d.get_headers())
    print("\n\n")
    print("Number of cols")
    print(d.get_num_columns())
    print("\n\n")
    print("5th row")
    print(d.get_row(5))
    print("\n\n")
    print("Get value")
    print(d.get_value(5, 'Horsepower'))
    print("\n\n")
    print("get_data function")
    print(d.get_data(['Origin', 'Horsepower']))
    print("\n\n")

    # analysis routines
    print("data range")
    print(analysis.data_range(d, ['Origin', 'Horsepower']))
    print("\n\n")
    print("mean of horsepower and origin")
    print(analysis.mean(d, ['Horsepower', 'Origin']))
    print("\n\n")
    print("standard deviation for horsepower and origin")
    print(analysis.stdev(d, ['Horsepower', 'Origin']))
    print("\n")
    print("normalized columns origin and horsepower")
    print(analysis.normalize_columns_separately(d, ['Origin', 'Horsepower']))
    print("\n\n")
    print("normalized together origin and horsepower")
    print(analysis.normalize_columns_together(d, ['Origin', 'Horsepower']))
    print("\n\n")
    print("median of columns origin, horspower and weight")
    print(analysis.median(d, ['Origin', 'Horsepower', 'Weight']))
    print(d.get_data(['Origin', 'Horsepower']).shape)
Пример #2
0
def main(argv):
    """Demonstrate the analysis helpers on the CSV file named in ``argv[1]``.

    When no filename is supplied, a usage line is printed and the process
    exits with status 0. Otherwise a ``Data`` object is built from the file
    and the result of each analysis call on the first and third headers is
    printed, followed by the numbered extension demos.
    """
    # a filename argument is mandatory
    if len(argv) <= 1:
        usage = 'Usage: python %s <csv filename>' % (argv[0])
        print(usage)
        exit(0)

    # reading the file happens inside the Data constructor
    dobj = Data(argv[1])
    headers = dobj.get_headers()

    def chosen():
        # Build a new [first, third] header list for every call, so each
        # analysis routine receives its own list object.
        return [headers[0], headers[2]]

    print(chosen())

    # the five core analysis functions, then extensions 1 and 2
    demos = (
        ("Data range by column:", "data_range"),
        ("Mean:", "mean"),
        ("Standard deviation:", "stdev"),
        ("Normalize columns separately:", "normalize_columns_separately"),
        ("Normalize columns together:", "normalize_columns_together"),
        ("Median:", "median"),
        ("Median Separately:", "median_separately"),
    )
    for caption, func_name in demos:
        routine = getattr(analysis, func_name)
        print(caption, routine(chosen(), dobj))

    # Extension 3
    few_rows = dobj.limit_rows()
    print("just  few rows:", few_rows)

    # Extension 4
    few_columns = dobj.limit_columns()
    print(
        "just a few columns. I changed the limit to 2 for demonstration purposes:",
        few_columns)

    # Extension 5
    overall = analysis.data_range(chosen(), dobj, True)
    print("Data range overall:", overall)

    # Extension 6: show the last row before and after adding a point
    notice = (
        "The next two print statements get the last row of data. I add a row of data in between,"
        "so they are different.")
    print(notice)
    print(dobj.get_row(-1))
    dobj.add_point([1, 2, 3])
    print(dobj.get_row(-1))
Пример #3
0
def main(argv):
    """Print a summary of the CSV file named in ``argv[1]``.

    When no filename is supplied, a usage line is printed and the process
    exits with status 0. Otherwise the function prints the row and column
    counts, the headers, the types, row index 2, every value of every
    column, and finally the results of ``select_columns`` and the analysis
    routines for the columns 'thing1' and 'thing3'.
    """
    # a filename argument is mandatory
    if len(argv) <= 1:
        print('Usage: python %s <csv filename>' % (argv[0]))
        exit(0)

    # reading the file happens inside the Data constructor
    dobj = data.Data(argv[1])

    # overall size
    print('Number of rows:    ', dobj.get_num_points())
    print('Number of columns: ', dobj.get_num_dimensions())

    # headers on one comma-separated line
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(headers[0] + "".join(", " + name for name in headers[1:]))

    # types on one comma-separated line
    print("\nTypes")
    types = dobj.get_types()
    print(types[0] + "".join(", " + kind for kind in types[1:]))

    # a single row
    print("\nPrinting row index 2")
    print(dobj.get_row(2))

    # every row: first column as-is, remaining columns right-aligned in a
    # 10-character field and truncated to 3 characters
    print("\nData")
    headers = dobj.get_headers()
    print("headers:", headers)
    for index in range(dobj.get_num_points()):
        pieces = [str(dobj.get_value(headers[0], index))]
        for name in headers[1:]:
            pieces.append("%10.3s" % (dobj.get_value(name, index)))
        print("".join(pieces))

    print("\n\n\n\nselect_columns")

    # the returned data is not used; the call itself is kept
    dobj.get_data()
    selected = dobj.select_columns(['thing1', 'thing3'])
    print("Selected columns:", selected)

    reports = (
        ("Data range:", "data_range"),
        ("Mean:", "mean"),
        ("Standard deviation:", "stdev"),
        ("Normalize columns separately:", "normalize_columns_separately"),
        ("Normalize columns together:", "normalize_columns_together"),
    )
    for caption, func_name in reports:
        routine = getattr(analysis, func_name)
        print(caption, routine(['thing1', 'thing3'], dobj))
Пример #4
0
	def buildLinearRegression(self, headers):
		"""Plot the selected columns and draw their least-squares line.

		Normalizes the ``headers`` columns separately, appends 0 and 1 to every
		row, multiplies the rows by the matrix from ``self.view.build()`` and
		draws one canvas item per row (an oval for "Dot", a rectangle for
		"Square"). It then runs ``scipy.stats.linregress`` on the unnormalized
		columns, draws the fitted line in gold and writes the equation and
		R^2 value into ``self.label``.

		headers -- column headers; the first is used as x and the second as y
		           for the fit.
		"""
		normalized = analysis.normalize_columns_separately(headers, self.data)

		# Append 0 and 1 to every normalized row before the rows are
		# multiplied by the view matrix.
		self.points = np.matrix([row + [0, 1] for row in normalized.tolist()])
		vtm = self.view.build()
		pts = (vtm * self.points.T).T

		# The widget selections do not change while the loop runs; read once.
		shape = self.shapeOption.get()
		color = self.colorOption.get()
		dx = 3
		# Convert the matrix once; converting it again for every row made the
		# loop quadratic in the number of points.
		for row in pts.tolist():
			x0, y0 = row[0] - dx, row[1] - dx
			x1, y1 = row[0] + dx, row[1] + dx
			if shape == "Dot":
				pt = self.canvas.create_oval(x0, y0, x1, y1,
											 fill=color, outline='', tags="data")
			elif shape == "Square":
				pt = self.canvas.create_rectangle(x0, y0, x1, y1,
												  fill=color, outline='', tags="data")
			else:
				# any other selection draws nothing
				continue
			self.dataObjects.append(pt)
			self.objects.append(pt)

		unnormalized = self.data.get_data(headers).T.tolist()
		regress_output = scipy.stats.linregress(unnormalized[0], unnormalized[1])
		slope = regress_output[0]
		intercept = regress_output[1]
		# Rounded copies are for the label only; the line itself is built from
		# the unrounded fit so that small slopes are not flattened.
		m = round(slope, 3)
		b = round(intercept, 3)
		r = round(regress_output[2] * regress_output[2], 3)

		ranges = analysis.data_range(headers, self.data)
		xmin, xmax = ranges[0][0], ranges[0][1]
		ymin, ymax = ranges[1][0], ranges[1][1]
		# NOTE: a constant y column (ymax == ymin) divides by zero here.
		pt1 = [0.0, ((xmin * slope + intercept) - ymin) / (ymax - ymin), 0, 1]
		pt2 = [1.0, ((xmax * slope + intercept) - ymin) / (ymax - ymin), 0, 1]
		print("point1")
		print(pt1)
		print("point2")
		print(pt2)
		self.regressionMatrix = np.matrix([pt1, pt2])
		pts = (vtm * self.regressionMatrix.T).T
		print(pts)
		best_fit = self.canvas.create_line(pts[0, 0], pts[0, 1], pts[1, 0], pts[1, 1],
										   width=3, fill='gold', tags="data")
		self.regressionLines.append(best_fit)
		self.label['text'] = "The best fit line equation:\n y = " + str(m) + "x + " + str(b) + "\n\nR^2 value: " + str(r)
Пример #5
0
def test(filename):
    """Add two columns to the data in ``filename`` and print analysis results.

    Builds a ``Data`` object, appends one 'enum' column and one 'numeric'
    column of 15 values each, prints the data, and then prints the result of
    each ``an`` routine for the first two headers.
    """
    data = Data(filename)

    enum_values = ['a'] * 9 + ['aa', 'aaa'] + ['a'] * 3 + ['aa']
    data.addColumn('enumstuff3', 'enum', enum_values)

    numeric_values = [1] * 5 + [2] * 5 + [4, 3, 3, 4, 5]
    data.addColumn('numberstuff3', 'numeric', numeric_values)

    print(data.get_data())
    # the return value is discarded, exactly as before
    data.__str__()

    def leading_pair():
        # re-read the headers on every call and return the first two
        return [data.get_headers()[0], data.get_headers()[1]]

    # NOTE(review): these calls receive `filename`, not `data`, as their
    # second argument -- confirm that is what the `an` functions expect.
    routine_names = (
        "data_range",
        "mean",
        "stdev",
        "normalize_columns_seperately",
        "normalize_columns_together",
    )
    for routine_name in routine_names:
        routine = getattr(an, routine_name)
        print(routine(leading_pair(), filename))
Пример #6
0
	def buildLinearRegression(self, independent, dependent):
		"""Plot two columns and overlay the line fitted by ``sc.linregress``.

		Normalizes the ``independent`` and ``dependent`` columns separately,
		appends a column of zeros and a column of ones, multiplies the rows by
		the matrix from ``self.view.build()`` and draws a black oval per row.
		It then fits a line to the unnormalized columns, draws it in red and
		places a label showing the slope at the line's second endpoint.

		independent -- header of the column used as x
		dependent   -- header of the column used as y
		"""
		dx = 5
		dy = 5

		# task5.1: normalize each column on its own and put them side by side
		xvar = independent
		yvar = dependent
		norm_x = analysis.normalize_columns_separately([xvar], self.data)
		norm_y = analysis.normalize_columns_separately([yvar], self.data)

		# task5.2 / task5.3: add a third column of zeros and a fourth column
		# of ones
		num_rows = self.data.get_num_rows()
		self.data2matrix = np.hstack((norm_x, norm_y,
									  np.zeros((num_rows, 1)),
									  np.ones((num_rows, 1))))

		# task5.4: build the VTM and transform the data points
		vtm = self.view.build()
		tp = (vtm * self.data2matrix.T).T

		# one oval per transformed point
		for i in range(tp.shape[0]):
			tx = tp[i, 0]
			ty = tp[i, 1]
			pt = self.canvas.create_oval(tx - dx, ty - dy, tx + dx, ty + dy,
										 fill="black", outline='')
			self.objects.append(pt)

		# task5.5: linear regression on the unnormalized data.
		# The fifth value returned by linregress is the standard error of the
		# slope, not R^2; R^2 is the square of the correlation coefficient.
		xy = self.data.get_data([xvar, yvar])
		slope, intercept, r_value, p_value, std_err = sc.linregress(xy)
		r2 = r_value ** 2
		print("%s %s %s" % (slope, intercept, r2))

		# task5.6: data ranges of both columns
		x_bounds = analysis.data_range([xvar], self.data)
		y_bounds = analysis.data_range([yvar], self.data)

		# task5.7: y of the fitted line at both ends of the x range, rescaled
		# by the y range
		y_span = y_bounds[0][1] - y_bounds[0][0]
		value1 = ((x_bounds[0][0] * slope + intercept) - y_bounds[0][0]) / y_span
		value2 = ((x_bounds[0][1] * slope + intercept) - y_bounds[0][0]) / y_span
		print("hi")
		self.LRendpoints = np.matrix([[0, value1, 0, 1],
									  [1, value2, 0, 1]])

		# task5.8: transform the endpoints and create the canvas line
		points = (vtm * self.LRendpoints.T).T
		self.regLine = self.canvas.create_line(points[0, 0], points[0, 1],
											   points[1, 0], points[1, 1],
											   fill="Red", width=3)
		self.LRobjects.append(self.regLine)

		# task5.9: label placed at the second endpoint
		self.lineLabel = tk.Label(self.canvas, text="Linear Regression:" + str(slope))
		self.lineLabel.place(x=points[1, 0], y=points[1, 1])
Пример #7
0
	def buildLinearRegression(self, headers):
		"""Plot two columns, draw the fitted line and show its statistics.

		Normalizes the ``headers`` columns separately, appends a zeros and a
		ones column, multiplies the rows by the matrix from ``self.v.build()``
		and draws a blue oval per row. It then runs ``st.linregress`` on the
		unnormalized columns, draws the fitted line in red and writes slope,
		intercept and r^2 into ``self.statslabel``.

		headers -- column headers; ``headers[0]`` is x and ``headers[1]`` is y.
		"""
		norm = an.normalize_columns_separately(self.data, headers)

		# x and y are automatically the first two dimensions
		xdatahead = headers[0]
		ydatahead = headers[1]

		# When either header is None, self.dataPointMatrix is left as it was.
		if xdatahead is not None and ydatahead is not None:
			zeromatrix = np.zeros(norm.shape[0])
			onesmatrix = np.ones(norm.shape[0])
			dmatrix = np.matrix(norm)
			nmatrix = np.matrix((zeromatrix, onesmatrix)).T
			self.dataPointMatrix = np.hstack((dmatrix, nmatrix))

		vtm = self.v.build()
		pts = (vtm * self.dataPointMatrix.T).T

		dx = 5
		for i in range(pts.shape[0]):
			x = pts[i, 0]
			y = pts[i, 1]
			pt = self.canvas.create_oval(x - dx, y - dx, x + dx, y + dx,
										 fill='blue', outline='')
			self.objects.append(pt)

		xdata = np.array(self.data.get_data([xdatahead]).T)[0]
		ydata = np.array(self.data.get_data([ydatahead]).T)[0]

		slope, intercept, r_value, p_value, slope_std_error = st.linregress(xdata, ydata)
		r2_value = r_value * r_value

		rangex = an.data_range(self.data, [xdatahead])
		rangey = an.data_range(self.data, [ydatahead])

		# y of the fitted line at both ends of the x range, rescaled by the
		# y range
		y_span = rangey[0, 1] - rangey[0, 0]
		yend0 = ((rangex[0, 0] * slope + intercept) - rangey[0, 0]) / y_span
		yend1 = ((rangex[0, 1] * slope + intercept) - rangey[0, 0]) / y_span

		# "%s" formatting prints the same text as the Python 2 statement
		# ``print label, value`` and also works on Python 3.
		print("minx %s" % (rangex[0, 0],))
		print("maxx %s" % (rangex[0, 1],))
		print("miny %s" % (rangey[0, 0],))
		print("maxy %s" % (rangey[0, 1],))

		# endpoints are stored as columns, so no transpose is needed below
		linemtrxcol1 = np.matrix([[0.0], [yend0], [0.0], [1.0]])
		linemtrxcol2 = np.matrix([[1.0], [yend1], [0.0], [1.0]])
		self.linRegEndpoints = np.hstack((linemtrxcol1, linemtrxcol2))
		print("vtm %s" % (vtm,))
		print("linRegEndpoints %s" % (self.linRegEndpoints,))
		le = vtm * self.linRegEndpoints
		print("le %s" % (le,))

		self.linRegLines.append(self.canvas.create_line(le[0, 0], le[1, 0], le[0, 1], le[1, 1], fill="red", tags="X"))

		self.statslabel.delete('1.0', tk.END)
		self.statslabel.insert(tk.END, "Slope: " + str(slope) + " " + "Intercept: " + str(intercept) + " " + "r^2 value: " + str(r2_value))
Пример #8
0
    for header in nheaders[1:]:
        s += ", " + header
    print(s)

    print("\nNumeric Matrix:")
    print(dobj.numeric_matrix)

    # print out the types
    print("\nTypes:")
    types = dobj.get_types()
    s = types[0]
    for type in types[1:]:
        s += ", " + type
    print(s)

    r = analysis.data_range(headers, dobj)
    print("Data Range:\n ", r)
    mean = analysis.mean(headers, dobj)
    print("Mean: \n", mean)

    std = analysis.stdev(headers, dobj)
    print("Standard Deviation: \n", std)

    #std = analysis.stdev(headers, dobj)
    #print("Standard Deviation: \n", std)

    nor_m1 = analysis.normalize_columns_separately(headers, dobj)
    print("Normalized Columns Separately: \n", nor_m1)

    nor_m2 = analysis.normalize_columns_together(headers, dobj)
    print("Normalized Columns Together: \n", nor_m2)
Пример #9
0
    def build_3d_linear_regression(self, independent_variables,
                                   dependent_variable):
        """Plot three columns and record a regression line for them.

        Normalizes the two independent columns and the dependent column
        separately, appends a column of ones, multiplies by the matrix from
        ``self.view.build()`` and draws a small black oval per row. It then
        calls ``analysis.linear_regression``, stores the line endpoints in
        ``self.regression_endpoints``, creates the line and a label with the
        fit values, and finally calls the three ``update*`` methods.

        independent_variables -- sequence whose first two items are headers
        dependent_variable    -- header of the dependent column
        """
        self.plot = analysis.normalize_columns_separately([
            independent_variables[0], independent_variables[1],
            dependent_variable
        ], self.data)
        self.plot = np.hstack((self.plot, np.ones((self.plot.shape[0], 1))))

        # build the view matrix and transform the points
        # NOTE(review): this multiplies plot * vtm rather than
        # (vtm * plot.T).T -- confirm which orientation view.build() expects.
        vtm = self.view.build()
        pts = self.plot * vtm

        # initialize self.size so that the movement functions don't break
        self.size = []
        # one graphical point per data point
        for i in range(len(pts)):
            self.size.append(1)
            x = pts[i, 0]
            y = pts[i, 1]
            pt = self.canvas.create_oval(int(x - 1),
                                         int(y - 1),
                                         int(x + 1),
                                         int(y + 1),
                                         fill="black",
                                         outline='')
            self.points.append(pt)

        linres = analysis.linear_regression(self.data, independent_variables,
                                            dependent_variable)
        slope0 = linres[0]
        slope1 = linres[1]
        intercept = linres[2]
        rvalue = linres[4]

        # Each range is looked up once and both ends are read from it.
        x_bounds = analysis.data_range([independent_variables[0]], self.data)
        y_bounds = analysis.data_range([independent_variables[1]], self.data)
        z_bounds = analysis.data_range([dependent_variable], self.data)
        xmin, xmax = x_bounds[0][0], x_bounds[0][1]
        ymin, ymax = y_bounds[0][0], y_bounds[0][1]
        zmin, zmax = z_bounds[0][0], z_bounds[0][1]

        yends = [
            ((xmin * slope0[0, 0] + intercept[0, 0]) - ymin) / (ymax - ymin),
            ((xmax * slope0[0, 0] + intercept[0, 0]) - ymin) / (ymax - ymin)
        ]
        zends = [
            ((xmin * slope1[0, 0] + intercept[0, 0]) - zmin) / (zmax - zmin),
            ((xmax * slope1[0, 0] + intercept[0, 0]) - zmin) / (zmax - zmin)
        ]

        # one endpoint per column: rows are x, y, z and a row of ones
        self.regression_endpoints = np.matrix([[0.0, 1.0],
                                               [yends[0], yends[1]],
                                               [zends[0], zends[1]], [1, 1]])

        print("self.regression_endpoints", self.regression_endpoints)
        # NOTE(review): the line and label are created from the untransformed
        # endpoints; presumably updateFits() repositions them -- confirm.
        self.line_of_fit = (self.canvas.create_line(
            self.regression_endpoints[0, 0],
            self.regression_endpoints[1, 0],
            self.regression_endpoints[0, 1],
            self.regression_endpoints[1, 1],
            fill="red"))

        self.regression_lines.append(self.line_of_fit)
        self.fit_label = tk.Label(self.canvas,
                                  text="slope0: " + str(slope0[0, 0]) +
                                  "\nslope1: " + str(slope1[0, 0]) +
                                  "\nIntercept: " + str(intercept[0, 0]) +
                                  "\nR-value: " + str(rvalue))
        self.fit_label.place(x=self.regression_endpoints[0, 1],
                             y=self.regression_endpoints[1, 1])
        self.updateAxes()
        self.updateFits()
        self.updatePoints()
Пример #10
0
    print("String Matrix Representation")
    data.__str__(False)
    print()

    print("Subsets of Matrix\n")
    col = [1]
    row = [2, 4]
    print("All rows with Column Subset")
    print(data.subset(col), "\n")
    print("All columns with Row Subset")
    print(data.subset(rows=row), "\n")
    print("Subset Rows and Columns")
    print(data.subset(col, row), "\n")

    print("Range of Numeric Data")
    print(analysis.data_range(data, data.get_headers()), "\n")

    print("IQR of the Numeric Columns")
    print(analysis.data_iqr(data, data.get_headers()), "\n")

    print("Mean of the Numeric Columns")
    print(analysis.data_mean(data, data.get_headers()), "\n")

    print("Median of the Numeric Columns")
    print(analysis.data_median(data, data.get_headers()), "\n")

    print("StDev of the Numeric Columns")
    print(analysis.data_stdev(data, data.get_headers()), "\n")

    print("Variance of the Numeric Columns")
    print(analysis.data_variance(data, data.get_headers()), "\n")
Пример #11
0
	def buildLinearRegression(self):
		"""Rebuild the plotted data matrix and overlay a linear-regression fit.

		Deletes the previous fit line and caption (if a line is stored),
		normalizes the ``self.dataheaders`` columns into ``self.dataMatrix``
		and calls ``self.buildAxes()``. With exactly two headers it fits
		``scipy.stats.linregress`` and draws one red line; otherwise it calls
		``analysis.linear_regression`` with the first two headers as
		independent variables and the third as dependent, and draws four
		coloured lines. The fit values are stored on ``self`` and a caption
		is drawn at the second transformed endpoint.
		"""
		# remove what an earlier call drew
		previous_line = self.gRegressLine
		if previous_line is not None:
			self.canvas.delete(previous_line)
			self.canvas.delete(self.glinText)
		self.gRegressLine = None

		# normalized columns, padded with zeros when only two headers are
		# selected, plus a final column of ones
		normalized = analysis.normalize_columns_separately(self.data, self.dataheaders)
		self.rows = len(normalized)
		if len(self.dataheaders) == 2:
			zeros_column = np.zeros(shape=(self.rows, 1))
			normalized = np.hstack((normalized, zeros_column))
		ones_column = np.ones(shape=(self.rows, 1))
		self.dataMatrix = np.hstack((normalized, ones_column))
		self.buildAxes()

		def rescale(value, bounds):
			# position of value between bounds[1] (maps to 0) and bounds[0]
			# (maps to 1)
			return (value - bounds[1]) / (bounds[0] - bounds[1])

		if len(self.dataheaders) != 2:
			# multiple regression: two independent variables, one dependent
			fit = analysis.linear_regression(self.data, self.dataheaders[:2], [self.dataheaders[2],])
			self.intercept = fit[0][0]
			self.slope.append(fit[0][1])
			self.slope.append(fit[0][2])
			self.std_err = fit[1]
			self.r_squared = fit[2]
			self.p_value = fit[4]

			bounds = analysis.data_range(self.data, self.dataheaders)
			x0_at_one = rescale(bounds[0][0]*self.slope[0] + self.intercept, bounds[2])
			x0_at_zero = rescale(bounds[0][1]*self.slope[0] + self.intercept, bounds[2])
			x1_at_one = rescale(bounds[1][0]*self.slope[1] + self.intercept, bounds[2])
			x1_at_zero = rescale(bounds[1][1]*self.slope[1] + self.intercept, bounds[2])

			# x1 goes in the x direction, x2 in y, the dependent goes in z
			self.endpoints = np.matrix([[0, 0, x0_at_zero, 1],
										[1, 0, x0_at_one, 1],
										[0, 0, x1_at_zero, 1],
										[0, 1, x1_at_one, 1]])
			vtm = self.view.build()
			pts = (vtm * self.endpoints.T).T

			# each edge gets its own colour so the edges can be told apart
			self.gRegressLines = []
			edges = ((0, 1, "red"), (2, 3, "green"), (0, 2, "blue"), (1, 3, "black"))
			for start, end, colour in edges:
				edge = self.canvas.create_line(pts[start,0], pts[start,1], pts[end,0], pts[end,1], fill = colour)
				self.gRegressLines.append(edge)

			caption = ("X0 Slope: %.3f, X1 Slope: %.3f, Intercept: %.3f, R Squared: %.3f"%(self.slope[0], self.slope[1], self.intercept, self.r_squared))
			self.glinText = self.canvas.create_text(pts[1,0], pts[1,1], text = caption)
		else:
			# simple regression on the two selected columns
			fit = scipy.stats.linregress(self.data.get_data(self.dataheaders))
			slope, self.intercept, r_value, self.p_value, self.std_err = fit
			self.slope.append(slope)
			self.r_squared = r_value**2

			bounds = analysis.data_range(self.data, self.dataheaders)
			# NOTE(review): the endpoints use self.slope[0], not the slope
			# fitted just above -- they differ if self.slope was not empty.
			at_one = rescale(bounds[0][0]*self.slope[0] + self.intercept, bounds[1])
			at_zero = rescale(bounds[0][1]*self.slope[0] + self.intercept, bounds[1])
			self.endpoints = np.matrix([[0, at_zero, 0, 1],
										[1, at_one, 0, 1]])
			vtm = self.view.build()
			pts = (vtm * self.endpoints.T).T
			self.gRegressLine = self.canvas.create_line(pts[0,0], pts[0,1], pts[1,0], pts[1,1], fill = "red")
			caption = ("Slope: %.3f, Intercept: %.3f, R Squared: %.3f"%(slope, self.intercept, r_value**2))
			self.glinText = self.canvas.create_text(pts[1,0], pts[1,1], text = caption)
Пример #12
0
	def buildAxes(self):
		"""Redraw the axes, axis labels, tick marks and data points.

		Calls ``self.resetData()``, transforms the stored axis, tick and data
		matrices with the matrix from ``self.view.build()`` and creates the
		canvas items for them. When ``self.dataMatrix`` is set, tick labels
		are derived from ``analysis.data_range`` of each selected header
		(looked up in ``self.data`` or, failing that, ``self.PCA``) and one
		oval is drawn per data row, coloured and sized from
		``self.colorMatrix`` / ``self.sizeMatrix`` when those options are set.

		Colour components are converted with ``int()`` before ``%02x``
		formatting, because formatting a float with ``%x`` raises TypeError
		on Python 3. The check that the colour header exists runs once
		before the point loop instead of once per point; an unknown header
		now only prints the warning, where it previously went on to fail on
		an unbound local.
		"""
		def bounds_for(header):
			# the header may belong to the loaded data or to the PCA data
			source = self.data if header in self.data.get_headers() else self.PCA
			return analysis.data_range(source, (header,))

		def tick_labels(bounds):
			# labels at 1/4, 2/4 and 3/4 of the way from bounds[0][1] towards
			# bounds[0][0], and at bounds[0][0] itself
			start = bounds[0][1]
			span = bounds[0][0] - bounds[0][1]
			return ["%.2f" % (start + span/4.0),
					"%.2f" % (start + 2*span/4.0),
					"%.2f" % (start + 3*span/4.0),
					"%.2f" % (bounds[0][0])]

		self.resetData()
		vtm = self.view.build()
		pts = (vtm * self.axes.T).T
		# tick-mark endpoints in view coordinates
		xpts = (vtm*self.xticks.T).T
		ypts = (vtm*self.yticks.T).T
		zpts = (vtm*self.zticks.T).T

		has_data = self.dataMatrix is not None
		if has_data:
			datapts = (vtm*self.dataMatrix.T).T

		# offsets, divided by the view extent, that move each axis label away
		# from the end of its axis
		label_offsets = {0: [0, 10], 1: [10, 0], 2: [5, 5]}

		for i in range(3):
			axis = self.canvas.create_line(pts[2*i,0], pts[2*i,1], pts[2*i+1,0], pts[2*i+1,1])
			self.gAxes.append(axis)

		for i in range(len(self.dataheaders)):
			text = self.canvas.create_text(pts[2*i+1,0] + label_offsets[i][0]/self.view.extent[0,0],
										   pts[2*i+1,1] + label_offsets[i][1]/self.view.extent[0,1],
										   text = self.dataheaders[i])
			self.gLabels.append(text)

		if has_data:
			# one entry per axis: tick endpoints, label texts, and the lists
			# that collect the created tick and label items
			axes_specs = [
				(xpts, tick_labels(bounds_for(self.dataheaders[0])), self.gXticks, self.gXlabels),
				(ypts, tick_labels(bounds_for(self.dataheaders[1])), self.gYticks, self.gYlabels),
			]
			if len(self.dataheaders) > 2:
				axes_specs.append(
					(zpts, tick_labels(bounds_for(self.dataheaders[2])), self.gZticks, self.gZlabels))

			# items are created tick by tick (x, y, then z) to keep the
			# original creation order on the canvas
			for i in range(4):
				for tickpts, labels, ticks, texts in axes_specs:
					tick = self.canvas.create_line(tickpts[2*i,0], tickpts[2*i,1], tickpts[2*i+1,0], tickpts[2*i+1,1])
					ticks.append(tick)
					text = self.canvas.create_text(tickpts[2*i+1,0], tickpts[2*i+1,1], text = labels[i])
					texts.append(text)

		if has_data:
			rows = self.rows
			lenx = 3*1.0/self.view.extent[0,0]
			# NOTE(review): leny also uses extent[0,0]; the axis labels above
			# use extent[0,1] for y -- confirm whether this is intentional.
			leny = 3*1.0/self.view.extent[0,0]

			# decide the colouring mode once; it cannot change inside the loop
			use_palette = False
			use_gradient = False
			if self.colorResult is not None:
				use_palette = self.colorVar.get() == 1
				use_gradient = not use_palette
			if use_gradient and rows > 0:
				if self.colorResult not in self.data.get_headers():
					if self.colorResult not in self.PCA.get_headers():
						print("something is wrong, your color does not exist")

			gray = int(0.5*255)
			default_color = "#%02x%02x%02x" % (gray, gray, gray)

			for i in range(rows):
				dx = 1
				color = default_color
				if use_palette:
					color = self.colors[int(self.colorMatrix[i,0])]
				elif use_gradient:
					# logistic curve centred on 0.5: low values come out
					# yellow-ish (r, g high), high values blue
					alpha = 1.0/(1.0+math.e**(-10*(self.colorMatrix[i,0]-0.5)))
					r = int((1.0 - alpha)*255)
					g = int((1.0 - alpha)*255)
					b = int(alpha*255)
					color = "#%02x%02x%02x" % (r, g, b)
				if self.sizeResult is not None:
					dx = self.sizeMatrix[i,0]*5
				self.objects.append(self.canvas.create_oval(datapts[i,0] - lenx*dx,
										datapts[i,1] - leny*dx,
										datapts[i,0] + lenx*dx,
										datapts[i,1] + leny*dx,
										fill = color,
										outline = ''))

		self.scaleText.set("%.2f, %.2f"%(self.view.extent[0,0], self.view.extent[0,1]))
		self.resetAxes()
Пример #13
0
 def buildLinearRegression(self, indx, indz, dep, export, filename):
     """Plot the chosen columns, fit a linear regression and draw the fit.

     Normalizes the selected columns separately, pads them to four columns,
     multiplies the rows by the matrix from ``self.view.build()`` and draws a
     small black oval per row. With ``indz == ''`` it runs
     ``scipy.stats.linregress`` on ``[indx, dep]``; otherwise it calls
     ``analysis.linear_regression`` with ``[indx, indz]`` as independent
     variables. The fitted line is drawn in red with the equation beside it.

     indx     -- header of the first independent column
     indz     -- header of the second independent column, or '' for none
     dep      -- header of the dependent column
     export   -- when equal to 1, the results are written to a text file
     filename -- export file name; ".txt" is appended
     """
     multiple = indz != ''
     num_rows = self.data.get_raw_num_rows()

     if multiple:
         matrix = analysis.normalize_columns_separately(self.data, [indx, dep, indz])
     else:
         matrix = analysis.normalize_columns_separately(self.data, [indx, dep])
         # no third variable: fill the third column with zeros
         matrix = np.hstack((matrix, np.matrix(np.zeros(num_rows)).T))

     # final column of ones
     self.dataMatrix = np.hstack((matrix, np.matrix(np.ones(num_rows)).T))

     # calculate view coordinates
     vtm = self.view.build()
     pts = (vtm * self.dataMatrix.T).T

     # use points with default size and color
     self.sizes = [2] * num_rows
     self.colors = ['#000000'] * num_rows
     for i in range(len(pts)):
         size = self.sizes[i]
         pt = self.canvas.create_oval(pts[i, 0] - size, pts[i, 1] - size,
                                      pts[i, 0] + size, pts[i, 1] + size,
                                      fill=self.colors[i], outline='')
         self.objects.append(pt)

     if not multiple:
         # single variable linear regression
         slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
                                     self.data.get_data([indx, dep]))
         ranges = analysis.data_range(self.data, [indx, dep])
         y_span = ranges[1][1] - ranges[1][0]
         end1y = ((ranges[0][0]*slope + intercept) - ranges[1][0]) / y_span
         end2y = ((ranges[0][1]*slope + intercept) - ranges[1][0]) / y_span
         self.regressionMatrix = np.matrix([[0.0, end1y, 0.0, 1.0],
                                            [1.0, end2y, 0.0, 1.0]])

         eqn = "y = %.3fx + %.3f \nR = %.3f" % (slope, intercept, r_value)
         data = "p = %.3f \nStandard error = %.3f" % (p_value, std_err)
         out = eqn + "\n" + data
     else:
         # multiple variable linear regression
         b, sse, r2, t, p = analysis.linear_regression(self.data, [indx, indz], dep)
         ranges = analysis.data_range(self.data, [indx, indz, dep])
         dep_span = ranges[2][1] - ranges[2][0]
         end1y = ranges[0][0]*b[0] + ranges[1][0]*b[1] + b[2]
         end1y = (end1y - ranges[2][0]) / dep_span
         end2y = ranges[0][1]*b[0] + ranges[1][1]*b[1] + b[2]
         end2y = (end2y - ranges[2][0]) / dep_span
         self.regressionMatrix = np.matrix([[0.0, end1y, 0.0, 1.0],
                                            [1.0, end2y, 1.0, 1.0]])

         eqn = "y = %.3fx + %.3fz + %.3f \nR^2 = %.3f" % (b[0], b[1], b[2], r2)
         sse_data = "Sum-squared error = %.3f" % (sse)
         p_data = "p = [%.3f, %.3f, %.3f]" % (p[0, 0], p[0, 1], p[0, 2])
         t_data = "t-statistic = [%.3f, %.3f, %.3f]" % (t[0, 0], t[0, 1], t[0, 2])
         out = eqn + "\n" + sse_data + "\n" + p_data + "\n" + t_data

     # display regression onscreen
     self.canvas.itemconfig(self.labels[0], text="x")
     self.canvas.itemconfig(self.labels[1], text="y")
     self.canvas.itemconfig(self.labels[2], text="z")
     endpts = (vtm * self.regressionMatrix.T).T
     l = self.canvas.create_line(endpts[0, 0], endpts[0, 1], endpts[1, 0],
                                 endpts[1, 1], fill="red")
     self.regressionObjects.append(l)
     regLabel = self.canvas.create_text(endpts[1, 0]+120, endpts[1, 1]+20, text=eqn)
     self.labels.append(regLabel)
     title = "Linear regression for " + str(self.fn)

     # write linear regression function to file; the with-block closes the
     # file even when the write fails
     if export == 1:
         with open(filename + ".txt", 'w') as out_file:
             out_file.write(title + "\n" + out)
Пример #14
0
def main():
    """Print a report on the CSV file named in ``sys.argv[1]``.

    When no filename is supplied, a usage line is printed and the process
    exits with status 0. Otherwise the function prints the file name, size,
    headers, numeric headers and types, the results of the analysis
    routines, then adds a numeric column, prints the resulting matrix and
    writes headers, types and matrix rows to ``foo.csv``.
    """
    numpy.set_printoptions(suppress=True)
    print("\n----- Database Info -----")
    if len(sys.argv) < 2:
        print('Usage: python %s <csv filename>' % (sys.argv[0]))
        # sys.exit works even when the site-provided exit() is unavailable
        sys.exit(0)

    # create a data object, which reads in the data
    dobj = data.Data(sys.argv[1])
    print("\nName: ", dobj.get_filename())
    # print out information about the data
    print('Number of rows:    ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())

    # print out the headers
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(", ".join(headers))

    # print out the numeric headers
    print("\nNumeric Headers:")
    nheaders = dobj.get_numericheaders()
    print(", ".join(nheaders))

    # print out the types
    print("\nTypes:")
    print(", ".join(dobj.get_types()))

    r = analysis.data_range(headers, dobj)
    print("Data Range:\n ", r)
    mean = analysis.mean(headers, dobj)
    print("Mean: \n", mean)

    std = analysis.stdev(headers, dobj)
    print("Standard Deviation: \n", std)

    # normalization is only run when every header is a numeric header
    if headers == nheaders:
        nor_m1 = analysis.normalize_columns_separately(headers, dobj)
        print("Normalized Columns Separately: \n", nor_m1)
        nor_m2 = analysis.normalize_columns_together(headers, dobj)
        print("Normalized Columns Together: \n", nor_m2)

    s = analysis.sumup(headers, dobj)
    print("Sum:\n", s)

    print("Variance:\n", analysis.variance(headers, dobj))

    # EXTENSION5 ADD COLUMN
    # NOTE(review): `add_colummn` is spelled this way in the original call;
    # presumably it matches the method name on Data -- confirm.
    dobj.add_colummn('new col', 'numeric',
                     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
    print(
        "\nAdd new column: 'new col','numeric', [1,2,3,4,5,6,7,8,9,10,11,12,13,14]"
    )
    print("----- New Matrix: -----")
    m = dobj.get_whole_matrix()
    print(m)
    print('Number of rows:    ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())
    print("---------------------------------")

    # EXTENSION6 WRITE TO A CSV file
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; without it extra blank rows appear on
    # platforms that translate '\n'.
    a = numpy.asarray(m)
    with open('foo.csv', 'w', newline='') as outputfile:
        wr = csv.writer(outputfile, delimiter=',')
        wr.writerow(dobj.get_headers())
        wr.writerow(dobj.get_types())
        wr.writerows(a)