def main(): # ``range()`` returns a list ... with memory_usage() as u: x = range(10 * 1000 * 1000) # ... so in our case the memory used by this process will be quite higher # now. print "range(): %s" % (u.rss,) # ``xrange()``, in contrast, returns an *iterator* ... with memory_usage() as u: x = xrange(10 * 1000 * 1000) # ... so now there should be almost no memory increase. print "xrange(): %s" % (u.rss,)
def main(): for x, y in zip(xrange(1000), my_range(1000)): if x != y: raise Exception("Oops: %d is not %d" % (x, y)) # ``my_range()`` returns an iterator, too ... with memory_usage() as u: x = my_range(10 * 1000 * 1000) # ... so now there should be almost no memory increase. print "my_range(): %s" % (u.rss,)
def _add_transaction_counts(training, path_to_csv):
    """Pass 1: stream the CSV in chunks and accumulate per-user transaction
    counts into ``training['total_number_of_transactions']``."""
    df_iter = pd.read_csv(path_to_csv, low_memory=False, iterator=True,
                          chunksize=transactions_chunk_size)
    print("starting iteration...")
    for i, transactions in enumerate(df_iter):
        print("i=" + str(i))
        transactions = reformat_transactions(transactions)
        user_count = pd.DataFrame(Counter(transactions['msno']).most_common())
        user_count.columns = ['msno', 'current_number_of_transactions']
        user_count.set_index('msno', inplace=True)
        training = pd.merge(left=training, right=user_count, how='left',
                            left_index=True, right_index=True)
        # NaN after the left merge means "user had no transactions in this
        # chunk" — count that as 0 so the running total stays an int.
        training['current_number_of_transactions'] = training.current_number_of_transactions.apply(
            lambda x: int(x) if pd.notnull(x) else 0)
        training["total_number_of_transactions"] += training["current_number_of_transactions"]
        training.drop(['current_number_of_transactions'], axis=1, inplace=True)
        print("memory usage of training: ")
        print(memory_usage(training))
        print("memory usage of transactions: ")
        print(memory_usage(transactions))
    print("end of iteration...")
    return training


def _attach_most_recent_transaction(training, path_to_csv):
    """Pass 2: attach each user's most recent transaction row to ``training``.

    NOTE(review): after the first chunk ``training`` is indexed by msno while
    ``temp_training`` keeps msno as a column; the concat relies on pandas
    aligning the shared columns — preserved from the original structure.
    """
    training.reset_index(inplace=True)
    training_copy = training.copy()
    df_iter = pd.read_csv(path_to_csv, low_memory=False, iterator=True,
                          chunksize=transactions_chunk_size)
    print("starting iteration, looking for most recent transaction...")
    for i, transactions in enumerate(df_iter):
        print("i=" + str(i))
        # BUG FIX: the reformatted frame was discarded here (the other two
        # passes assign the return value); assign it for consistency.
        transactions = reformat_transactions(transactions)
        # BUG FIX: sorting ascending and taking first() selected the OLDEST
        # transaction despite the stated intent; sort descending so first()
        # is the most recent per user.
        recent_transactions = (transactions
                               .sort_values('transaction_date', ascending=False)
                               .groupby('msno').first())
        recent_transactions.reset_index(inplace=True)
        # BUG FIX: passing both on=['msno'] and right_index=True raises
        # pandas.errors.MergeError — merge on the msno column only.
        temp_training = pd.merge(left=training_copy, right=recent_transactions,
                                 how='right', on='msno')
        training = pd.concat((training, temp_training))
        # Keep only the most recent row per user across chunks (descending,
        # matching the per-chunk selection above).
        training = (training
                    .sort_values('transaction_date', ascending=False)
                    .groupby('msno').first())
        print("memory usage of training: ")
        print(memory_usage(training))
        print("memory usage of transactions: ")
        print(memory_usage(transactions))
    del training_copy  # release the snapshot before the next streaming pass
    return training


def _accumulate_price_per_day(training, path_to_csv):
    """Pass 3: compute price-per-day features.

    Adds ``price_per_day`` from the attached most-recent transaction and
    accumulates the per-chunk sums into ``usual_price_per_day``.
    The +0.01 guards against division by zero on payment_plan_days == 0.
    """
    training["price_per_day"] = training["actual_amount_paid"] / (training["payment_plan_days"] + 0.01)
    df_iter = pd.read_csv(path_to_csv, low_memory=False, iterator=True,
                          chunksize=transactions_chunk_size)
    print("starting iteration, looking for usual price per day...")
    for i, transactions in enumerate(df_iter):
        print("i=" + str(i))
        transactions = reformat_transactions(transactions)
        transactions["current_price_per_day"] = (
            transactions["actual_amount_paid"] / (transactions["payment_plan_days"] + 0.01))
        transactions = transactions.groupby("msno").sum()[["current_price_per_day"]]
        training = pd.merge(left=training, right=transactions, how='left',
                            left_index=True, right_index=True)
        # BUG FIX: users absent from this chunk get NaN from the left merge,
        # and NaN += poisons the running sum permanently; treat missing as 0
        # (consistent with the NaN handling in the counting pass).
        training["usual_price_per_day"] += training["current_price_per_day"].fillna(0)
        training.drop(['current_price_per_day'], axis=1, inplace=True)
    return training


def iterate_on_transactions(training=training, version=1):
    """Stream the transactions CSV (v1 or v2) and enrich ``training`` with
    transaction counts, each user's most recent transaction, and
    price-per-day features.

    NOTE(review): the default ``training=training`` binds the module-level
    frame at definition time — kept for interface compatibility.

    :param training: DataFrame indexed by msno to enrich (mutated via copies).
    :param version: 1 reads transactions.csv, anything else transactions_v2.csv.
    :returns: the enriched training DataFrame.
    """
    if version == 1:
        print("iterate on transactions")
        path_to_csv = path_to_data + 'transactions.csv'
    else:
        print("iterate on transactions_v2")
        path_to_csv = path_to_data + 'transactions_v2.csv'
    training = _add_transaction_counts(training, path_to_csv)
    # BUG FIX: the second and third passes previously hard-coded
    # 'transactions.csv', silently ignoring ``version``.
    training = _attach_most_recent_transaction(training, path_to_csv)
    training = _accumulate_price_per_day(training, path_to_csv)
    return training