def test_series_std(ddof):
    """Check that ``Series.std`` matches the pandas result for ``ddof``.

    A fixed-seed sample of 100 values in ``[-0.5, 0.5)`` is wrapped in a
    ``Series``; its standard deviation is compared against the one computed
    on the ``to_pandas()`` conversion of the same object.
    """
    # Fixed seed keeps the random sample reproducible between runs.
    np.random.seed(0)
    sample = np.random.random(100) - 0.5

    series = Series(sample)
    # Named so it does not collide with the conventional ``pd`` pandas alias.
    pandas_series = series.to_pandas()

    actual = series.std(ddof=ddof)
    expected = pandas_series.std(ddof=ddof)

    np.testing.assert_approx_equal(expected, actual)
def standardize(data):
    """Return ``data`` centred by ``subtract_mean`` and scaled by its std.

    The centred values are divided by ``pd.std`` of those same centred
    values, and the quotient is returned.
    """
    centered = subtract_mean(data)
    # NOTE(review): pandas has no top-level ``std`` function, so ``pd`` is
    # presumably bound to something other than pandas here -- confirm.
    spread = pd.std(centered)
    return centered / spread
# Template walkthrough: inspect raw data, clean it, summarise it, plot it.
# Before running, bind the placeholder names data_raw_1, data_raw_2,
# lower_bound, upper_bound and column to real values.
# NOTE(review): assumes data_raw / data_clean are pandas DataFrames created
# earlier in the script (head() and to_pickle() are called on data_clean).

# Display the figure prepared before this section (not visible here).
plt.show()

# Look at a scatter plot of the data; look for any erroneous points/outliers.
plt.figure()
plt.scatter(data_raw_1, data_raw_2)
plt.title("Raw Data")
plt.xlabel("X label")
plt.ylabel("Y label")
plt.show()

# Set the bounds on the data set for cleaning and repeat for all columns.
# A chained comparison (lower <= series <= upper) asks the Series for a single
# truth value and raises ValueError, so build the inclusive mask with
# Series.between() instead (both end points are kept, matching the two "<=").
in_bounds = data_raw[column].between(lower_bound, upper_bound)
data_clean[column] = data_raw[column][in_bounds]
data_clean.head()
data_clean.to_pickle("path")
# CAUTION!!! Only read pickles that YOU generate!!!!! No Exceptions!

# Now get the mean, standard deviation, minimum and maximum. These are Series
# methods (pandas has no top-level pd.mean/pd.std/pd.min/pd.max), and the
# results are named so that the min()/max() builtins are not shadowed.
mean = data_clean[column].mean()
std = data_clean[column].std()
minimum = data_clean[column].min()
maximum = data_clean[column].max()
# The message uses %-style specifiers, so it is filled in with the % operator;
# calling .format() on the value returned by print() cannot work.
print(
    "Mean: %f, Stand Dev: %f, Minimum: %f, Maximum: %f"
    % (mean, std, minimum, maximum)
)

# Now generate plots using the clean data and save the figure to a specific
# directory.
# NOTE(review): the placeholders are still named data_raw_*; confirm whether
# the cleaned columns were intended here.
plt.figure()
# Swap in the plot type you need: hist, scatter, plot, box (plt.boxplot).
plt.scatter(data_raw_1, data_raw_2)
plt.title("Data")  # describe what the plot is
plt.xlabel("X label")
plt.ylabel("Y label")
plt.savefig("path-to-directory.png")  # pick a path that you know you'll find
plt.show()