예제 #1
0
def movielens(users, ratings, movies, threads):
    """Run the MovieLens rating analysis through Grizzly and print timings.

    The work happens in two timed phases:

    1. Merge: each input is wrapped in ``gr.DataFrameWeld``; ratings are
       merged with users, the result is merged with movies, and the merged
       table is evaluated with ``threads`` workers and converted to pandas.
    2. Analysis: the merged table is re-wrapped, restricted to rows with
       ``age > 45``, pivoted into mean rating per title and gender, and
       narrowed to titles with at least 250 ratings. The male/female
       difference is sorted, and the ten largest per-title rating standard
       deviations are selected.

    Args:
        users: User table accepted by ``gr.DataFrameWeld``
            (presumably a pandas DataFrame -- confirm with callers).
        ratings: Ratings table, same kind of object as ``users``.
        movies: Movies table, same kind of object as ``users``.
        threads: Worker count passed to ``evaluate(workers=...)`` for the
            merge step.

    Returns:
        The second element returned by ``gr.group_eval``, i.e. the evaluated
        top-ten rating standard deviations by title.
    """
    start = time.time()
    ratings = gr.DataFrameWeld(ratings)
    users = gr.DataFrameWeld(users)
    movies = gr.DataFrameWeld(movies)

    data = gr.merge(gr.merge(ratings, users),
                    movies).evaluate(workers=threads).to_pandas()
    end = time.time()

    start1 = time.time()
    data = gr.DataFrameWeld(data)
    data = data[data['age'] > 45]
    mean_ratings = data.pivot_table('rating',
                                    index='title',
                                    columns='gender',
                                    aggfunc='mean')

    # Titles with at least 250 ratings (counted after the age filter).
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    mean_ratings = mean_ratings.loc[active_titles]
    mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
    sorted_by_diff = mean_ratings.sort_values(by='diff')
    rating_std_by_title = data.groupby('title')['rating'].std()
    rating_std_by_title = rating_std_by_title.loc[active_titles]
    rating_std_by_title = rating_std_by_title.sort_values(
        ascending=False)[0:10]
    # Both results are handed to group_eval together in a single call.
    sorted_by_diff, rating_std_by_title = gr.group_eval(
        [sorted_by_diff, rating_std_by_title])
    end1 = time.time()

    # Single-argument print(...) emits the same text on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Time to merge: %s" % (end - start))
    print("Time for analysis: %s" % (end1 - start1))
    print("Total: %s" % (end1 - start))
    return rating_std_by_title
    def test_groupby_sum(self):
        """Check ``groupby(...).sum()`` for single/multi keys and columns.

        Each case builds a small pandas frame, runs the grouped sum through
        ``gr.DataFrameWeld`` and compares the summed column values
        (order-insensitively) with hand-computed totals.
        """
        def grouped_sum(frame, keys):
            # Wrap the frame, group by `keys`, sum and evaluate.
            return gr.DataFrameWeld(frame).groupby(keys).sum().evaluate(False)

        # Single key, single value column.
        frame = pd.DataFrame({"a": [3, 2, 3], "b": [4, 5, 6]})
        summed = grouped_sum(frame, "a")
        self.assertItemsEqual([5, 10], summed.to_pandas()["b"])

        # Single key, two value columns.
        frame = pd.DataFrame({"a": [3, 2, 3], "b": [4, 5, 6], "c": [6, 4, 3]})
        summed = grouped_sum(frame, "a")
        self.assertItemsEqual([5, 10], summed.to_pandas()["b"])
        self.assertItemsEqual([4, 9], summed.to_pandas()["c"])

        # Two keys, single value column.
        frame = pd.DataFrame({"a": [3, 2, 3], "b": [2, 3, 2], "c": [6, 5, 4]})
        summed = grouped_sum(frame, ["a", "b"])
        self.assertItemsEqual([5, 10], summed.to_pandas()["c"])

        # Two keys, two value columns (only column "c" is asserted).
        frame = pd.DataFrame({
            "a": [3, 2, 3],
            "b": [2, 3, 2],
            "c": [6, 5, 4],
            "d": [6, 4, 3]
        })
        summed = grouped_sum(frame, ["a", "b"])
        self.assertItemsEqual([5, 10], summed.to_pandas()["c"])
예제 #3
0
def run(filename, threads):
    """Compute the top-1000 names per (year, sex) group and print timings.

    Reads the CSV at ``filename`` with the column names ``year``, ``sex``,
    ``name`` and ``births``, wraps it in ``gr.DataFrameWeld``, groups by
    year and sex, applies ``get_top1000`` to each group and evaluates the
    result with ``threads`` workers. Progress, timings and the total of the
    ``births`` column of the result are printed; nothing is returned.

    Args:
        filename: Path of the CSV file passed to ``pd.read_csv``.
        threads: Worker count passed to ``evaluate(workers=...)``.
    """
    # NOTE(review): the two-argument print(...) calls below print a tuple on
    # Python 2 unless ``print_function`` is imported at module level --
    # confirm against the file header.
    columns = ['year', 'sex', 'name', 'births']

    sys.stdout.write("Reading data...")
    sys.stdout.flush()
    names = pd.read_csv(filename, names=columns)
    print("done.")
    sys.stdout.flush()
    print("Size of names:", len(names))

    # Time preprocessing step
    # Original author's note: this is a weird effect where the columns seem
    # to be copied unnecessarily, hence the separate timing of the wrap.
    start = time.time()
    grouped = gr.DataFrameWeld(names)
    end = time.time()
    print("DF Weld time:", end - start)

    e2e_start = time.time()
    grouped = grouped.groupby(['year', 'sex'])
    top1000 = grouped.apply(get_top1000)
    # Drop the group index, not needed
    top1000.reset_index(inplace=True, drop=True)
    top1000 = top1000.evaluate(True, workers=threads).to_pandas()

    e2e_end = time.time()
    print("Total time:", e2e_end - e2e_start)

    print(top1000['births'].sum())
예제 #4
0
def gen_data(size):
    """Build a Grizzly frame of ``size`` identical synthetic rows.

    Every row has a total population of 500000, a total adult population of
    250000 and 1000 robberies, all stored as float64.

    Args:
        size: Number of rows (passed to ``np.ones`` as the array shape).

    Returns:
        A ``gr.DataFrameWeld`` wrapping the generated pandas DataFrame.
    """
    def constant_column(value):
        # float64 column of length `size` filled with `value`.
        return np.ones(size, dtype="float64") * value

    columns = {
        "Total population": constant_column(500000),
        "Total adult population": constant_column(250000),
        "Number of robberies": constant_column(1000),
    }
    return gr.DataFrameWeld(pd.DataFrame(data=columns))
    def test_groupby_sort(self):
        """Compare a per-group sort-and-slice in Grizzly against pandas.

        For each value of column ``a`` the rows are sorted by ``b`` and the
        first two are kept, once through ``gr.DataFrameWeld`` and once
        through plain pandas; the two results are then compared.
        """
        # test single column
        frame = pd.DataFrame({
            "a": [3, 2, 3, 2, 3, 2, 3, 2],
            "b": [6, 7, 4, 5, 4, 2, 1, 7],
            "c": [4, 5, 6, 7, 8, 9, 1, 4]
        })

        weld_frame = gr.DataFrameWeld(frame)
        weld_groups = weld_frame.groupby("a").apply(
            lambda g: g.sort_values(by='b').slice(0, 2))
        # The return value is not used; presumably this flattens the grouped
        # result in place -- confirm against the Grizzly implementation.
        weld_groups.reset_index()
        weld_result = weld_groups.evaluate().to_pandas()

        expected = frame.groupby("a").apply(
            lambda g: g.sort_values(by='b')[0:2])
        expected.reset_index(inplace=True, drop=True)
        # NOTE(review): iterating a pandas DataFrame yields its column
        # labels, so this assertion appears to compare column names rather
        # than cell values -- confirm intent.
        self.assertItemsEqual(weld_result, expected)
예제 #6
0
def analyze(top1000, threads):
    """Compute the per-year share of births by sex for 'lesl'-like names.

    Steps performed on the ``gr.DataFrameWeld`` wrapper of ``top1000``:

    1. Take the unique values of the ``name`` column and keep those whose
       lower-cased form contains ``'lesl'``.
    2. Keep only the rows whose name is in that set.
    3. Pivot ``births`` by ``year`` (index) and ``sex`` (columns) with a sum.
    4. Divide the table by ``table.sum(1)`` along ``axis=0``
       (presumably normalising each year's row to proportions -- confirm
       against Grizzly's ``div`` semantics).

    Args:
        top1000: Table accepted by ``gr.DataFrameWeld`` with at least the
            columns ``name``, ``births``, ``year`` and ``sex``.
        threads: Worker count passed to ``evaluate(workers=...)``.

    Returns:
        The evaluated table converted with ``to_pandas()``.
    """
    top1000 = gr.DataFrameWeld(top1000)
    top1000names = top1000['name']
    all_names = top1000names.unique()
    lesley_like = all_names.filter(all_names.lower().contains('lesl'))

    filtered = top1000.filter(top1000names.isin(lesley_like))
    table = filtered.pivot_table('births',
                                 index='year',
                                 columns='sex',
                                 aggfunc='sum')

    table = table.div(table.sum(1), axis=0)
    return table.evaluate(True, workers=threads).to_pandas()
# Show at most ten rows when a DataFrame is printed.
pd.options.display.max_rows = 10

# The .dat files under data/ml-1m/ use '::' as the field separator and have
# no header row. A multi-character separator is only handled by pandas'
# Python parsing engine, so it is requested explicitly instead of relying on
# the automatic fallback (which emits a ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('data/ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('data/ml-1m/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

# Phase 1 (timed by start/end): wrap the tables for Grizzly, merge ratings
# with users and then with movies, evaluate and convert back to pandas.
start = time.time()
ratings = gr.DataFrameWeld(ratings)
users = gr.DataFrameWeld(users)
movies = gr.DataFrameWeld(movies)

data = gr.merge(gr.merge(ratings, users), movies).evaluate(True).to_pandas()
end = time.time()

# Phase 2 (timer start1): analysis on the merged table. Note that printing
# the merged table happens inside this timed region.
start1 = time.time()
# print(...) with one argument behaves the same on Python 2 and 3.
print(data)
data = gr.DataFrameWeld(data)
mean_ratings = data.pivot_table('rating', index='title', columns='gender',
                                aggfunc='mean')


# Titles with at least 250 ratings.
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
#!/usr/bin/python

# The usual preamble
import pandas as pd
import grizzly.grizzly as gr
import time

# Get data (NYC 311 service request dataset) and start cleanup.
# The listed placeholder strings are read as missing values, and the zip
# column is forced to str.
na_values = ['NO CLUE', 'N/A', '0']
raw_requests = pd.read_csv('data/311-service-requests.csv',
                           na_values=na_values,
                           dtype={'Incident Zip': str})
requests = gr.DataFrameWeld(raw_requests)
# Single-argument print(...) emits the same text on Python 2 and 3.
print("Done reading input file...")

# Timing starts after the file has been read and wrapped.
start = time.time()

# Fix requests with extra digits: keep only the first five characters
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

# Fix requests with 00000 zipcodes by overwriting them with the string "nan"
zero_zips = requests['Incident Zip'] == '00000'
requests['Incident Zip'][zero_zips] = "nan"

# Display unique incident zips again (this time cleaned)
print(requests['Incident Zip'].unique().evaluate())
end = time.time()

print("Total end-to-end time, including compilation: %.2f" % (end - start))
예제 #9
0
#!/usr/bin/python

# The usual preamble
import numpy as np
import grizzly.numpy_weld as npw
import pandas as pd
import grizzly.grizzly as gr
import time

# Get data (pipe-delimited US cities/states/counties file) and drop rows
# with missing values before handing the table to Grizzly
raw_data = pd.read_csv('data/us_cities_states_counties.csv', delimiter='|')
raw_data.dropna(inplace=True)
data = gr.DataFrameWeld(raw_data)
# Single-argument print(...) emits the same text on Python 2 and 3.
print("Done reading input file...")

# Timing starts after the file has been read and wrapped.
start = time.time()

# Get all city information with total population greater than 500,000
data_big_cities = data[data["Total population"] > 500000]

# Compute "crime index" proportional to
# (Total population + 2*(Total adult population) - 2000*(Number of robberies)) / 100000
data_big_cities_stats = data_big_cities[
    ["Total population", "Total adult population", "Number of robberies"]].values
predictions = npw.dot(data_big_cities_stats, np.array(
    [1, 2, -2000], dtype=np.int64)) / 100000.0
data_big_cities["Crime index"] = predictions

# Clamp "crime index" scores: values >= 0.02 become 0.032, then values
# < 0.01 become 0.005 (any later per-state aggregation is not part of this
# excerpt)
data_big_cities["Crime index"][data_big_cities["Crime index"] >= 0.02] = 0.032
data_big_cities["Crime index"][data_big_cities["Crime index"] < 0.01] = 0.005
# Concatenate everything into a single DataFrame, renumbering the index
names = pd.concat(pieces, ignore_index=True)
# Single-argument print(...) emits the same text on Python 2 and 3.
print("Size of names: %d" % len(names))


def get_top1000(group):
    """Return the first 1000 rows of ``group`` ordered by descending births.

    Original author's caveat: results can differ slightly from pandas (the
    name 'Leslyn', year '25' goes missing) because the ordering of rows that
    share the same sort value changes.

    Args:
        group: Object providing ``sort_values(by=..., ascending=...)`` whose
            result provides ``slice(start, size)`` -- presumably a Grizzly
            group; confirm with callers.
    """
    by_births_desc = group.sort_values(by='births', ascending=False)
    return by_births_desc.slice(0, 1000)


# Time preprocessing step (start0/end0): wrap the names table for Grizzly,
# group by year and sex, apply get_top1000 to every group and evaluate.
start0 = time.time()
grouped = gr.DataFrameWeld(names).groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# Drop the group index, not needed
top1000.reset_index(inplace=True, drop=True)
# `passes` is defined outside this excerpt and forwarded to evaluate().
top1000 = top1000.evaluate(True, passes=passes).to_pandas()
end0 = time.time()

# Second timed phase (start1): re-wrap the evaluated result and select the
# names whose lower-cased form contains 'lesl'.
start1 = time.time()
top1000 = gr.DataFrameWeld(top1000)
top1000names = top1000['name']
all_names = top1000names.unique()
lesley_like = all_names.filter(all_names.lower().contains('lesl'))

# Keep only the rows whose name is one of the 'lesl'-like names.
filtered = top1000.filter(top1000names.isin(lesley_like))
table = filtered.pivot_table('births',
                             index='year',
 def test_filter_self(self):
     k = np.array(["A", "b", "D", "F", "e", "A", "A", "ac"], dtype=str)
     s = np.array(["A", "b", "D", "F", "e", "A", "A", "ac"], dtype=str)
     df = pd.DataFrame({"k": k, "s": s})
     grs = gr.DataFrameWeld(df)
     grs = grs["k"].unique()