Exemplo n.º 1
0
def main():
    """Exercise the Data class accessors and the analysis module on cars.csv.

    Python 2 script: prints the raw view of the file (headers, dimensions,
    rows, single values), then the numeric matrix view, then summary
    statistics (range, mean, stdev, normalizations, median) for selected
    columns.  NOTE(review): assumes `Data` and `analysis` are imported
    elsewhere in the original project -- confirm before running standalone.
    """
    d = Data('cars.csv')
    # --- raw (string) view of the file ---
    print "Raw Headers"
    print d.get_raw_headers()
    print "\n\n"
    print "Raw number of columns"
    print d.get_raw_num_columns()
    print "\n\n"
    print "Raw number of rows"
    print d.get_raw_num_rows()
    print "\n\n"
    print "13th row"
    print d.get_raw_row(13)
    print "\n\n"
    print "Value at row 6, header 'Car'"
    print d.get_raw_value(6, 'Car')
    print "\n\n"
    # --- numeric (matrix) view ---
    print "Matrix data"
    print d.matrix_data
    print "\n\n"
    print "Headers"
    print d.get_headers()
    print "\n\n"
    print "Number of cols"
    print d.get_num_columns()
    print "\n\n"
    print "5th row"
    print d.get_row(5)
    print "\n\n"
    print "Get value"
    print d.get_value(5, 'Horsepower')
    print "\n\n"
    print "get_data function"
    print d.get_data(['Origin', 'Horsepower'])
    print "\n\n"
    # --- analysis-module statistics on selected columns ---
    print "data range"
    print analysis.data_range(d, ['Origin', 'Horsepower'])
    print "\n\n"
    print "mean of horsepower and origin"
    print analysis.mean(d, ['Horsepower', 'Origin'])
    print "\n\n"
    print "standard deviation for horsepower and origin"
    print analysis.stdev(d, ['Horsepower', 'Origin'])
    print "\n"
    print "normalized columns origin and horsepower"
    print analysis.normalize_columns_separately(d, ['Origin', 'Horsepower'])
    print "\n\n"
    print "normalized together origin and horsepower"
    print analysis.normalize_columns_together(d, ['Origin', 'Horsepower'])
    print "\n\n"
    print "median of columns origin, horspower and weight"
    print analysis.median(d, ['Origin', 'Horsepower', 'Weight'])
    print d.get_data(['Origin', 'Horsepower']).shape
Exemplo n.º 2
0
	def build( self, A, categories ):
		"""Fit the classifier: compute per-class means, unbiased variances,
		and Gaussian scale factors from the rows of A grouped by category."""
		A = np.matrix(A)
		# np.unique returns the distinct labels plus, for every sample, the
		# index of its label -- i.e. each row's class assignment.
		labels, cls_idx = np.unique(np.array(categories.T), return_inverse=True)
		self.num_classes = labels.size
		self.class_labels = labels
		self.num_features = A.shape[1]

		# One C x F matrix each for the means, variances and scale factors.
		dims = (self.num_classes, self.num_features)
		self.class_means = np.matrix(np.zeros(dims))
		self.class_vars = np.matrix(np.zeros(dims))
		self.class_scales = np.matrix(np.zeros(dims))

		# Fill one row of means/vars per class from that class's samples.
		for cls in range(self.num_classes):
			members = A[(cls_idx == cls), :]
			self.class_means[cls, :] = an.mean(None, None, matrix=members)
			self.class_vars[cls, :] = np.var(members, axis=0, ddof=1)

		# Gaussian normalization constant 1/sqrt(2*pi*var), element-wise.
		n_rows, n_cols = self.class_scales.shape
		for cls in range(n_rows):
			for feat in range(n_cols):
				self.class_scales[cls, feat] = 1 / np.sqrt(2 * np.pi * self.class_vars[cls, feat])
		return
Exemplo n.º 3
0
def main(argv):
    """Command-line driver: read a CSV into a Data object and print its
    dimensions, headers, types, a sample row, the full data table, two
    selected columns, and the basic analysis statistics."""
    # A CSV filename is required on the command line.
    if len(argv) < 2:
        print('Usage: python %s <csv filename>' % (argv[0]))
        exit(0)

    # The Data constructor reads the file in.
    dobj = data.Data(argv[1])

    print('Number of rows:    ', dobj.get_num_points())
    print('Number of columns: ', dobj.get_num_dimensions())

    # Headers as one comma-separated line.
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(", ".join(headers))

    # Types as one comma-separated line.
    print("\nTypes")
    print(", ".join(dobj.get_types()))

    # A single row by index.
    print("\nPrinting row index 2")
    print(dobj.get_row(2))

    # Dump every row: first column as-is, the rest through a fixed-width
    # (truncating) string format.
    print("\nData")
    headers = dobj.get_headers()
    print("headers:", headers)
    for row in range(dobj.get_num_points()):
        line = str(dobj.get_value(headers[0], row))
        for header in headers[1:]:
            line += "%10.3s" % (dobj.get_value(header, row))
        print(line)

    print("\n\n\n\nselect_columns")

    d = dobj.get_data()
    # print("Data:", d)
    selected = dobj.select_columns(['thing1', 'thing3'])
    print("Selected columns:", selected)

    # Run the analysis helpers over the same pair of columns.
    cols = ['thing1', 'thing3']
    print("Data range:", analysis.data_range(cols, dobj))
    print("Mean:", analysis.mean(cols, dobj))
    print("Standard deviation:", analysis.stdev(cols, dobj))
    print("Normalize columns separately:",
          analysis.normalize_columns_separately(cols, dobj))
    print("Normalize columns together:",
          analysis.normalize_columns_together(cols, dobj))
Exemplo n.º 4
0
def main(argv):
    """Driver: load a CSV and exercise the analysis routines plus several
    extension features (median, row/column limits, overall range, add_point)."""
    # A CSV filename is required on the command line.
    if len(argv) < 2:
        print('Usage: python %s <csv filename>' % (argv[0]))
        exit(0)

    dobj = Data(argv[1])
    headers = dobj.get_headers()
    # The whole script works on the first and third columns.
    pair = [headers[0], headers[2]]

    # The five core analysis functions.
    print(pair)
    print("Data range by column:", analysis.data_range(pair, dobj))
    print("Mean:", analysis.mean(pair, dobj))
    print("Standard deviation:", analysis.stdev(pair, dobj))
    print("Normalize columns separately:",
          analysis.normalize_columns_separately(pair, dobj))
    print("Normalize columns together:",
          analysis.normalize_columns_together(pair, dobj))

    # Extension 1: median over the selected columns.
    print("Median:", analysis.median(pair, dobj))

    # Extension 2: per-column medians.
    print("Median Separately:", analysis.median_separately(pair, dobj))

    # Extension 3: preview a limited number of rows.
    print("just  few rows:", dobj.limit_rows())

    # Extension 4: preview a limited number of columns.
    print(
        "just a few columns. I changed the limit to 2 for demonstration purposes:",
        dobj.limit_columns())

    # Extension 5: range over the selected columns taken together.
    print("Data range overall:", analysis.data_range(pair, dobj, True))

    # Extension 6: add_point appends a row, so the "last row" changes.
    print(
        "The next two print statements get the last row of data. I add a row of data in between,"
        "so they are different.")
    print(dobj.get_row(-1))
    dobj.add_point([1, 2, 3])
    print(dobj.get_row(-1))
Exemplo n.º 5
0
def testAnalysis():
    """Exercise the analysis module (range, mean, stdev, normalizations,
    correlation) on the drug-deaths dataset, pretty-printing the normalized
    matrices with tabulate.  Python 2 script; `data`, `a` (analysis) and
    `tabulate` are imported elsewhere in the original project.
    """
    print "\n########################## testAnalysis() ##########################\n"
    # NOTE(review): Data is given two filenames here -- confirm against the
    # Data class what the second file is used for.
    d = data.Data("drugdeaths.csv", "data2.csv")
    colHeaders = ["Heroin", "Prescription Opioid", "Alcohol", "Cocaine"]
    print "dataRange(colHeaders, d): ", a.dataRange(colHeaders, d)
    print "mean(colHeaders, d): ", a.mean(colHeaders, d)
    print "stDev(colHeaders, d): ", a.stDev(colHeaders, d)
    #print "normalizeColSeparate(colHeaders, d): ", a.normalizeColSeparate(colHeaders, d)
    #print "normalizeColTogether(colHeaders, d): \n", a.normalizeColTogether(colHeaders, d)
    #print colHeaders
    # Tabulated per-column normalization.
    print "\n Normalization of individual columns\n"
    print tabulate(a.normalizeColSeparate(colHeaders, d).tolist(),
                   headers=colHeaders)
    # Tabulated whole-matrix normalization.
    print "\n Normalization of entire matrix\n"
    print tabulate(a.normalizeColTogether(colHeaders, d).tolist(),
                   headers=colHeaders)
    print "correlate(colHeaders, d): ", a.correlate(colHeaders, d)
    print "\n#####################################################################\n"
Exemplo n.º 6
0
def analysis_plot():
    """Load the merged dataset and build the summary artifacts used by the
    report: the mean max-temperature, two grouped plots, and a
    cause-of-death cross table.

    Returns a 4-tuple: (mean, plot1 result, plot2 result, crosstable).
    """
    frame = analysis.open_data('../src/DataMerge.csv')
    temp_mean = analysis.mean(frame, 'temperatureMax')

    # Both plots group by the same two columns.
    group_cols = ['Reported Month', 'Region of Incident']
    plot_max = analysis.plot1(frame, group_cols, 'temperatureMax',
                              'temperatureMax', 'Total Dead and Missing')
    plot_min = analysis.plot2(frame, group_cols, 'temperatureMin',
                              'temperatureMin', 'Total Dead and Missing')

    # Cross-tabulate region against a fixed list of causes of death.
    subset = analysis.select_columns(
        frame, ['Region of Incident', 'temperatureMax', 'Cause of Death'])
    causes = [
        'Accident', 'Cardiac arrest', 'Dehydration', 'Murdered',
        'Sickness and lack of access to medicines', 'Suffocation',
        'Unknown', 'Weather conditions'
    ]
    table = analysis.crosstable(subset, 'Region of Incident',
                                'Cause of Death', causes)

    return temp_mean, plot_max, plot_min, table
Exemplo n.º 7
0
def test(filename):
    """Smoke test: load *filename* into a Data object, append an enum column
    and a numeric column, then run the analysis helpers on the first two
    headers."""
    data = Data(filename)
    enum_values = [
        'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'aa', 'aaa', 'a', 'a',
        'a', 'aa'
    ]
    data.addColumn('enumstuff3', 'enum', enum_values)
    data.addColumn('numberstuff3', 'numeric',
                   [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 3, 3, 4, 5])
    print(data.get_data())
    data.__str__()
    # NOTE(review): the analysis helpers are handed the raw *filename* here,
    # while other call sites in this collection pass a Data object --
    # confirm which signature `an.*` actually expects.
    first_two = [data.get_headers()[0], data.get_headers()[1]]
    print(an.data_range(first_two, filename))
    print(an.mean(first_two, filename))
    print(an.stdev(first_two, filename))
    print(an.normalize_columns_seperately(first_two, filename))
    print(an.normalize_columns_together(first_two, filename))
Exemplo n.º 8
0
    def build(self, A, categories):
        """Fit the classifier: per-class means, unbiased variances, and
        Gaussian scale factors computed from the rows of A grouped by the
        given categories."""
        A = np.matrix(A)
        # np.unique yields the distinct labels plus, for every sample, the
        # index of its label -- i.e. each row's class assignment.
        labels, assignment = np.unique(np.array(categories.T),
                                       return_inverse=True)
        self.num_classes = labels.size
        self.class_labels = labels
        self.num_features = A.shape[1]

        # One C x F matrix per statistic.
        dims = (self.num_classes, self.num_features)
        self.class_means = np.matrix(np.zeros(dims))
        self.class_vars = np.matrix(np.zeros(dims))

        # Fill one row of means/vars per class from that class's samples.
        for cls in range(self.num_classes):
            members = A[(assignment == cls), :]
            self.class_means[cls, :] = an.mean(None, None, matrix=members)
            self.class_vars[cls, :] = np.var(members, axis=0, ddof=1)

        # Gaussian normalization constant 1/sqrt(2*pi*var), computed
        # element-wise over the whole C x F variance matrix in one
        # vectorized expression (same values as a per-entry loop).
        self.class_scales = np.matrix(1 / np.sqrt(2 * np.pi * self.class_vars))
        return
Exemplo n.º 9
0
    # NOTE(review): fragment -- the enclosing function's definition is not
    # visible in this chunk; `s`, `dobj`, and `headers` are bound above it.
    print(s)

    # Show the numeric matrix backing the Data object.
    print("\nNumeric Matrix:")
    print(dobj.numeric_matrix)

    # print out the types
    print("\nTypes:")
    types = dobj.get_types()
    s = types[0]
    for type in types[1:]:
        s += ", " + type
    print(s)

    # Summary statistics over all headers.
    r = analysis.data_range(headers, dobj)
    print("Data Range:\n ", r)
    mean = analysis.mean(headers, dobj)
    print("Mean: \n", mean)

    std = analysis.stdev(headers, dobj)
    print("Standard Deviation: \n", std)

    #std = analysis.stdev(headers, dobj)
    #print("Standard Deviation: \n", std)

    nor_m1 = analysis.normalize_columns_separately(headers, dobj)
    print("Normalized Columns Separately: \n", nor_m1)

    nor_m2 = analysis.normalize_columns_together(headers, dobj)
    print("Normalized Columns Together: \n", nor_m2)

    #dobj.add_colummn('new col','numeric', [1,2,3,4,5,6,7,8,9,10,11,12,13,14])
Exemplo n.º 10
0
                    # NOTE(review): fragment of a TensorFlow-style training
                    # loop -- the enclosing function and the names used here
                    # (nextBatch, devStep, saver, sess, config, ...) are
                    # defined outside this chunk.
                    recalls = []

                    # Evaluate over the whole eval set in batches,
                    # accumulating per-batch metrics.
                    for batchEval in nextBatch(evalReviews, evalLabels,
                                               config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(
                            batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        f_betas.append(f_beta)
                        precisions.append(precision)
                        recalls.append(recall)

                    # Record and report the batch-averaged metrics for this
                    # evaluation pass.
                    time_str = datetime.datetime.now().isoformat()
                    dev_result.append({
                        "step": currentStep,
                        "loss": mean(losses),
                        "accs": mean(accs),
                        "precisions": mean(precisions),
                        "recalls": mean(recalls),
                        "f_betas": mean(f_betas)
                    })
                    print(
                        "{}, step: {}, loss: {}, acc: {},precision: {}, recall: {}, f_beta: {}"
                        .format(time_str, currentStep, mean(losses),
                                mean(accs), mean(precisions), mean(recalls),
                                mean(f_betas)))

                if currentStep % config.training.checkpointEvery == 0:
                    # Another way to persist the model: save a checkpoint file.
                    path = saver.save(sess,
                                      config.checkpointSavePath,
Exemplo n.º 11
0
    # NOTE(review): fragment -- the enclosing function's definition is not
    # visible in this chunk; `Data` and `analysis` come from the original
    # project's imports.
    # # Load the data files into a Data object
    # dataClean = Data(filename='data-clean.csv')
    # dataGood = Data(filename='data-good.csv')
    # dataNoisy = Data(filename='data-noisy.csv')
    #
    # # Run multiple linear regression on the Data objects
    # analysis.testRegression(dataClean)
    # analysis.testRegression(dataGood)
    # analysis.testRegression(dataNoisy)

    data = Data(filename='GOOG-NASDAQ_TSLA.csv')

    # Descriptive statistics over the open/close/volume columns.
    print("\n\nDescriptive statistics of Tesla's stock data (daily open and close prices and trading volume:")
    print("Mean: ", analysis.mean(['Open', 'Close', 'Volume'], data))
    print("Standard deviation: ", analysis.stdev(['Open', 'Close', 'Volume'], data))
    print("Ranges: ", analysis.dataRange(['Open', 'Close', 'Volume'], data))
    print("Normalized columns: ", analysis.normalizeColumnsSeparately(['Open', 'Close', 'Volume'], data))
    print("Normalized globally: ", analysis.normalizeColumnsTogether(['Open', 'Close', 'Volume'], data))
    print("Variance: ", analysis.variance(['Open', 'Close', 'Volume'], data))
    print("Median: ", analysis.median(['Open', 'Close', 'Volume'], data))
    print("Mode value: ", analysis.modeValue(['Open', 'Close', 'Volume'], data))
    print("Mode frequency: ", analysis.modeFreq(['Open', 'Close', 'Volume'], data))
    print("Range value: ", analysis.rangeDiff(['Open', 'Close', 'Volume'], data), "\n")

    data.printData(20)

    # Mutate the data to demonstrate the setters.
    data.set_value(0.0001, 5, 'Open')
    data.set_column(data.get_column('Open'), 'Close')
Exemplo n.º 12
0
def main():
    """Load the CSV named on the command line into a Data object, print its
    structure and summary statistics, then demonstrate two extensions:
    adding a column and writing the full matrix back out as foo.csv.

    Uses sys.argv directly; exits with usage text when no filename is given.
    """
    numpy.set_printoptions(suppress=True)
    print("\n----- Database Info -----")
    if len(sys.argv) < 2:
        print('Usage: python %s <csv filename>' % (sys.argv[0]))
        exit(0)

    # create a data object, which reads in the data
    dobj = data.Data(sys.argv[1])
    print("\nName: ", dobj.get_filename())
    print('Number of rows:    ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())

    # All headers, then the numeric-only headers, each on one line.
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(", ".join(headers))

    print("\nNumeric Headers:")
    nheaders = dobj.get_numericheaders()
    print(", ".join(nheaders))

    print("\nTypes:")
    print(", ".join(dobj.get_types()))

    # Summary statistics over every header.
    r = analysis.data_range(headers, dobj)
    print("Data Range:\n ", r)
    mean = analysis.mean(headers, dobj)
    print("Mean: \n", mean)

    std = analysis.stdev(headers, dobj)
    print("Standard Deviation: \n", std)

    # Normalization only makes sense when every column is numeric
    # (merged the two identical `if` checks from the original).
    if headers == nheaders:
        nor_m1 = analysis.normalize_columns_separately(headers, dobj)
        print("Normalized Columns Separately: \n", nor_m1)
        nor_m2 = analysis.normalize_columns_together(headers, dobj)
        print("Normalized Columns Together: \n", nor_m2)

    s = analysis.sumup(headers, dobj)
    print("Sum:\n", s)

    print("Variance:\n", analysis.variance(headers, dobj))

    # EXTENSION5 ADD COLUMN
    # (the method name's spelling follows the Data class as written)
    dobj.add_colummn('new col', 'numeric',
                     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
    print(
        "\nAdd new column: 'new col','numeric', [1,2,3,4,5,6,7,8,9,10,11,12,13,14]"
    )
    print("----- New Matrix: -----")
    m = dobj.get_whole_matrix()
    print(m)
    print('Number of rows:    ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())
    print("---------------------------------")

    # EXTENSION6 WRITE TO A CSV file
    # Fix: open with newline='' as the csv module requires; without it the
    # writer emits an extra blank line between rows on Windows.
    a = numpy.asarray(m)
    with open('foo.csv', 'w', newline='') as outputfile:
        wr = csv.writer(outputfile, delimiter=',')
        wr.writerow(dobj.get_headers())
        wr.writerow(dobj.get_types())
        wr.writerows(a)