def movielens(users, ratings, movies, threads):
    # Phase 1: merge the three tables and materialize the result.
    start = time.time()
    ratings = gr.DataFrameWeld(ratings)
    users = gr.DataFrameWeld(users)
    movies = gr.DataFrameWeld(movies)
    data = gr.merge(gr.merge(ratings, users),
                    movies).evaluate(workers=threads).to_pandas()
    end = time.time()

    # Phase 2: per-title rating statistics for users over 45.
    start1 = time.time()
    data = gr.DataFrameWeld(data)
    data = data[data['age'] > 45]
    mean_ratings = data.pivot_table('rating', index='title',
                                    columns='gender', aggfunc='mean')
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    mean_ratings = mean_ratings.loc[active_titles]
    mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
    sorted_by_diff = mean_ratings.sort_values(by='diff')
    rating_std_by_title = data.groupby('title')['rating'].std()
    rating_std_by_title = rating_std_by_title.loc[active_titles]
    rating_std_by_title = rating_std_by_title.sort_values(
        ascending=False)[0:10]
    sorted_by_diff, rating_std_by_title = gr.group_eval(
        [sorted_by_diff, rating_std_by_title])
    end1 = time.time()

    print "Time to merge:", (end - start)
    print "Time for analysis:", (end1 - start1)
    print "Total:", end1 - start
    return rating_std_by_title
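# For comparison, a pandas-only sketch of the analysis phase above
# (illustrative, not part of the benchmark; assumes `data` is the merged
# pandas DataFrame with 'age', 'title', 'gender', and 'rating' columns):
def movielens_pandas(data):
    data = data[data['age'] > 45]
    mean_ratings = data.pivot_table('rating', index='title',
                                    columns='gender', aggfunc='mean')
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    rating_std_by_title = data.groupby('title')['rating'].std()
    rating_std_by_title = rating_std_by_title.loc[active_titles]
    return rating_std_by_title.sort_values(ascending=False)[0:10]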
def test_groupby_sum(self):
    # test single column
    df = pd.DataFrame({"a": [3, 2, 3], "b": [4, 5, 6]})
    input = gr.DataFrameWeld(df)
    groupby = input.groupby("a").sum().evaluate(False)
    self.assertItemsEqual([5, 10], groupby.to_pandas()["b"])

    # test multi column
    df = pd.DataFrame({"a": [3, 2, 3], "b": [4, 5, 6], "c": [6, 4, 3]})
    input = gr.DataFrameWeld(df)
    groupby = input.groupby("a").sum().evaluate(False)
    self.assertItemsEqual([5, 10], groupby.to_pandas()["b"])
    self.assertItemsEqual([4, 9], groupby.to_pandas()["c"])

    # test multikey single column
    df = pd.DataFrame({"a": [3, 2, 3], "b": [2, 3, 2], "c": [6, 5, 4]})
    input = gr.DataFrameWeld(df)
    groupby = input.groupby(["a", "b"]).sum().evaluate(False)
    self.assertItemsEqual([5, 10], groupby.to_pandas()["c"])

    # test multikey multi column
    df = pd.DataFrame({
        "a": [3, 2, 3],
        "b": [2, 3, 2],
        "c": [6, 5, 4],
        "d": [6, 4, 3]
    })
    input = gr.DataFrameWeld(df)
    groupby = input.groupby(["a", "b"]).sum().evaluate(False)
    self.assertItemsEqual([5, 10], groupby.to_pandas()["c"])
    self.assertItemsEqual([4, 9], groupby.to_pandas()["d"])
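# Hypothetical pandas cross-check for the expectations above (not part of
# the original test suite): pandas' own groupby-sum yields the same values,
# with group keys sorted ascending by default.
def pandas_groupby_sum_reference():
    df = pd.DataFrame({"a": [3, 2, 3], "b": [4, 5, 6]})
    sums = df.groupby("a").sum()
    assert list(sums["b"]) == [5, 10]  # a=2 -> 5, a=3 -> 4 + 6 = 10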
def run(filename, threads):
    years = range(1880, 2011)
    columns = ['year', 'sex', 'name', 'births']
    sys.stdout.write("Reading data...")
    sys.stdout.flush()
    names = pd.read_csv(filename, names=columns)
    print("done.")
    sys.stdout.flush()
    print("Size of names:", len(names))

    # Time the wrapping step on its own: constructing the DataFrameWeld
    # appears to copy the columns unnecessarily, so measure that cost here.
    start = time.time()
    grouped = gr.DataFrameWeld(names)
    end = time.time()
    print("DF Weld time:", end - start)

    e2e_start = time.time()
    grouped = grouped.groupby(['year', 'sex'])
    # get_top1000 (defined with the benchmark below) keeps each group's
    # 1000 rows with the most births.
    top1000 = grouped.apply(get_top1000)
    # Drop the group index, which is not needed
    top1000.reset_index(inplace=True, drop=True)
    top1000 = top1000.evaluate(True, workers=threads).to_pandas()
    # result = analyze(top1000)
    e2e_end = time.time()
    print("Total time:", e2e_end - e2e_start)
    print(top1000['births'].sum())
def gen_data(size):
    total_population = np.ones(size, dtype="float64") * 500000
    adult_population = np.ones(size, dtype="float64") * 250000
    num_robberies = np.ones(size, dtype="float64") * 1000
    return gr.DataFrameWeld(
        pd.DataFrame(data={
            "Total population": total_population,
            "Total adult population": adult_population,
            "Number of robberies": num_robberies,
        }))
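# Hypothetical sanity check for gen_data's constant columns (not part of the
# original benchmark): with the crime-index weights used elsewhere in these
# benchmarks, every synthetic row scores
# (500000 + 2 * 250000 - 2000 * 1000) / 100000 = -10.0.
import numpy as np
row = np.array([500000.0, 250000.0, 1000.0])
weights = np.array([1, 2, -2000], dtype=np.float64)
assert row.dot(weights) / 100000.0 == -10.0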
def test_groupby_sort(self):
    # test single column
    df = pd.DataFrame({
        "a": [3, 2, 3, 2, 3, 2, 3, 2],
        "b": [6, 7, 4, 5, 4, 2, 1, 7],
        "c": [4, 5, 6, 7, 8, 9, 1, 4]
    })
    input = gr.DataFrameWeld(df)
    groupby = input.groupby("a").apply(
        lambda g: g.sort_values(by='b').slice(0, 2))
    groupby.reset_index(inplace=True, drop=True)
    groupby = groupby.evaluate().to_pandas()

    # Compare against the equivalent pandas computation.
    mdf = df.groupby("a").apply(lambda g: g.sort_values(by='b')[0:2])
    mdf.reset_index(inplace=True, drop=True)
    self.assertItemsEqual(groupby, mdf)
def analyze(top1000, threads):
    start1 = time.time()
    top1000 = gr.DataFrameWeld(top1000)
    top1000names = top1000['name']
    all_names = top1000names.unique()
    lesley_like = all_names.filter(all_names.lower().contains('lesl'))
    filtered = top1000.filter(top1000names.isin(lesley_like))
    table = filtered.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')
    table = table.div(table.sum(1), axis=0)
    end1 = time.time()
    result = table.evaluate(True, workers=threads).to_pandas()
    return result
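# For reference, a pandas-only sketch of the same "Lesley-like" analysis
# (illustrative, not the benchmark itself; assumes top1000 is a pandas
# DataFrame with 'name', 'births', 'year', and 'sex' columns):
def analyze_pandas(top1000):
    all_names = pd.Series(top1000['name'].unique())
    lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
    filtered = top1000[top1000['name'].isin(lesley_like)]
    table = filtered.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')
    return table.div(table.sum(1), axis=0)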
pd.options.display.max_rows = 10

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('data/ml-1m/users.dat', sep='::',
                      header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('data/ml-1m/ratings.dat', sep='::',
                        header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/ml-1m/movies.dat', sep='::',
                       header=None, names=mnames)

start = time.time()
ratings = gr.DataFrameWeld(ratings)
users = gr.DataFrameWeld(users)
movies = gr.DataFrameWeld(movies)
data = gr.merge(gr.merge(ratings, users), movies).evaluate(True).to_pandas()
end = time.time()

start1 = time.time()
print data
data = gr.DataFrameWeld(data)
mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
#!/usr/bin/python

# The usual preamble
import pandas as pd
import grizzly.grizzly as gr
import time

# Get data (NYC 311 service request dataset) and start cleanup
na_values = ['NO CLUE', 'N/A', '0']
raw_requests = pd.read_csv('data/311-service-requests.csv',
                           na_values=na_values, dtype={'Incident Zip': str})
requests = gr.DataFrameWeld(raw_requests)
print "Done reading input file..."

start = time.time()

# Fix requests with extra digits
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

# Fix requests with 00000 zipcodes
zero_zips = requests['Incident Zip'] == '00000'
requests['Incident Zip'][zero_zips] = "nan"

# Display unique incident zips again (this time cleaned)
print requests['Incident Zip'].unique().evaluate()
end = time.time()

print "Total end-to-end time, including compilation: %.2f" % (end - start)
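# A pure-pandas version of the same cleanup, useful for cross-checking the
# Grizzly result (a sketch operating on the raw DataFrame read above):
zips = raw_requests['Incident Zip'].str.slice(0, 5)
zips[zips == '00000'] = "nan"
print zips.unique()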
#!/usr/bin/python

# The usual preamble
import numpy as np
import grizzly.numpy_weld as npw
import pandas as pd
import grizzly.grizzly as gr
import time

# Get data (US cities/states/counties dataset) and start cleanup
raw_data = pd.read_csv('data/us_cities_states_counties.csv', delimiter='|')
raw_data.dropna(inplace=True)
data = gr.DataFrameWeld(raw_data)
print "Done reading input file..."

start = time.time()

# Get all city information with total population greater than 500,000
data_big_cities = data[data["Total population"] > 500000]

# Compute "crime index" proportional to
# (Total population + 2*(Total adult population)
#  - 2000*(Number of robberies)) / 100000
data_big_cities_stats = data_big_cities[
    ["Total population", "Total adult population",
     "Number of robberies"]].values
predictions = npw.dot(data_big_cities_stats,
                      np.array([1, 2, -2000], dtype=np.int64)) / 100000.0
data_big_cities["Crime index"] = predictions

# Clamp outlying "crime index" scores to fixed values
data_big_cities["Crime index"][data_big_cities["Crime index"] >= 0.02] = 0.032
data_big_cities["Crime index"][data_big_cities["Crime index"] < 0.01] = 0.005
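# Illustrative pandas/NumPy equivalent of the pipeline above (a sketch for
# cross-checking; `raw` stands for the cleaned pandas DataFrame):
def crime_index_pandas(raw):
    big = raw[raw["Total population"] > 500000].copy()
    stats = big[["Total population", "Total adult population",
                 "Number of robberies"]].values
    big["Crime index"] = stats.dot(
        np.array([1, 2, -2000], dtype=np.float64)) / 100000.0
    big.loc[big["Crime index"] >= 0.02, "Crime index"] = 0.032
    big.loc[big["Crime index"] < 0.01, "Crime index"] = 0.005
    return big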
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
print "Size of names: %d" % len(names)

def get_top1000(group):
    # Note that a slight difference arises with the name 'Leslyn' (year '25'
    # missing), because the ordering pandas uses for rows with the same sort
    # value differs.
    return group.sort_values(by='births', ascending=False).slice(0, 1000)

# Time preprocessing step
start0 = time.time()
grouped = gr.DataFrameWeld(names).groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# Drop the group index, which is not needed
top1000.reset_index(inplace=True, drop=True)
top1000 = top1000.evaluate(True, passes=passes).to_pandas()
end0 = time.time()

start1 = time.time()
top1000 = gr.DataFrameWeld(top1000)
top1000names = top1000['name']
all_names = top1000names.unique()
lesley_like = all_names.filter(all_names.lower().contains('lesl'))
filtered = top1000.filter(top1000names.isin(lesley_like))
table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
def test_filter_self(self):
    k = np.array(["A", "b", "D", "F", "e", "A", "A", "ac"], dtype=str)
    s = np.array(["A", "b", "D", "F", "e", "A", "A", "ac"], dtype=str)
    df = pd.DataFrame({"k": k, "s": s})
    grs = gr.DataFrameWeld(df)
    grs = grs["k"].unique()
    # Deduplicating the column should leave one copy of each value.
    self.assertItemsEqual(["A", "b", "D", "F", "e", "ac"], grs.evaluate())