def print_stages(test_y, stage_predictions, test_metric):
    # Resolve the metric by name, preferring ml_metrics over sklearn.metrics.
    if hasattr(ml_metrics, test_metric):
        eval_metric = getattr(ml_metrics, test_metric)
    else:
        eval_metric = getattr(skMetrics, test_metric)
    count = 0
    iters = []
    loss = []
    for prediction in stage_predictions:
        count += 1
        if count in [1, 5, 10, 30] or count % 50 == 0:
            iters.append(count)
            loss.append(eval_metric(test_y, prediction))
    loss_df = pd.DataFrame({'Iteration': iters, 'Loss': loss})
    loss_df.rename(columns={'Loss': test_metric}, inplace=True)
    pd.set_printoptions(max_columns=len(loss_df.columns), max_rows=len(loss_df))
    print("Loss:")
    print(loss_df)
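# A hedged aside on the pd.set_printoptions calls used throughout these snippets:
# that function belongs to the old pandas display API and was later removed. In
# current pandas the same settings are configured with pd.set_option; a minimal
# equivalent sketch (option values chosen for illustration only):
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 20)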
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from credit_card_data import read_data

pd.set_printoptions(max_rows=50, max_columns=21)


def make_hist(frame, to_bin, norm=False):
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot("111")
    ylab = 'Probability' if norm else 'Count'
    xlab = frame[to_bin].name
    ax.set_ylabel(ylab, fontsize=12, color='blue')
    ax.set_xlabel(xlab, fontsize=12, color='blue')
    ticks = range(24)  # military times
    labels = map(lambda x: str("%s:00" % x), ticks)
    plt.xticks(ticks, labels)
    if not norm:
        ax.set_ylim(0, 8000)
    for tick in plt.gca().xaxis.iter_ticks():
        tick[0].label2On = True
        tick[0].label1On = False
        tick[0].label2.set_rotation('vertical')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sys import maxsize

pd.set_printoptions(threshold=maxsize)

print('#', '~' * 15, 'Load Data', '~' * 50)

od_one = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatest.txt",
    sep=',')
od_two = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatraining.txt",
    sep=',')
od_three = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatest2.txt",
    sep=',')

print(list(od_one))
print(list(od_two))
print(list(od_three))

# Combine datasets
od_full = pd.concat([od_one, od_two, od_three])

# Split the DateTime feature into separate columns
od_full[['date', 'time']] = od_full.date.apply(lambda x: pd.Series(str(x).split(" ")))
od_full.head()
import pandas as pd
from datetime import datetime
from pandas.io.data import DataReader

pd.set_printoptions(max_rows=2000)

aapl = DataReader("AAPL", "yahoo", datetime(2010, 1, 1), datetime(2012, 10, 30))
returns = aapl.pct_change()


def f(x):
    if x > 0.01:
        return 1
    elif x < -0.01:
        return -1
    else:
        return 0

frame = returns.applymap(f)
frame['UP'] = frame['Adj Close'] == 1
frame['DOWN'] = frame['Adj Close'] == -1
frame['NONE'] = frame['Adj Close'] == 0

from lxml import etree
import datetime  # note: this rebinds `datetime` to the module, shadowing the class imported above
from dateutil.parser import parse

path = "http://www.google.com/finance/company_news?q=NASDAQ:AAPL&output=rss&num=500"
root = etree.parse(path)
myRoot = root.getroot()
news = {}
# Must have xlrd module installed for ExcelFile to work
import pandas as pandas
import matplotlib.pyplot as plt
import scikits.statsmodels.tools.tools as tools
import numpy as np
from matplotlib.ticker import MultipleLocator

# Open the Excel file
xls = pandas.ExcelFile(r'D:\tomcat\webapps\DensoOBD\files\35400\08-12M HAM Door Jamb Switch.xls')

# Parse the specific Excel sheet name
df = xls.parse('Claims')

pandas.set_printoptions(precision=6, max_columns=12)

data = df[(df['MODEL_YEAR'] == 2009) & (df['FACTORY_CODE'] == 'ELP') & (df['MODEL_NAME'] == 'CRV')]
counts = data.groupby(['RO_DEALER_STATE'])['VIN'].count()
qty = counts.values
state = counts.index

# zip the qty and state lists together so we can sort them by qty
data_list_numeric = zip(qty, state)
data_list_numeric.sort()
data_list_numeric.reverse()

xlabels = []
yvalues = []
import pandas as pd
from pandas.io.data import DataReader
from datetime import datetime
import sqlite3
import pandas.io.sql as psql
import csv
import timeit
from django.utils.encoding import force_unicode

pd.set_printoptions(max_colwidth=400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"

con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)

# get rows with "polling" in the text
sqlPolling = 'SELECT [twitter.text], Newsworthy \
FROM HT_Annotated WHERE ([twitter.text] LIKE "% polling %" )'
polling = psql.read_frame(sqlPolling, con)
polling.head()
polling = pd.DataFrame(polling)

# Get rows with wordcount > 12
func_wordcounts = lambda x: len(x.strip().split(" "))
import sys
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100, max_rows=5000)

for group in groups:
    ave = data.groupby(group).mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        print '\n%s by %s' % (number, group)
        ave = ave[ave[number].notnull() & (ave['#'] > 10)]
        print ave.sort(number, ascending=False)
from __future__ import division
from copy import copy
import itertools
import glob
import os
import unittest

import pymc as pm
import numpy as np
import pandas as pd
import nose

pd.set_printoptions(precision=4)

from nose import SkipTest

import hddm
from hddm.diag import check_model


def add_outliers(data, p_outlier):
    """add outliers to data. half of the outliers will be fast, and the rest will be slow
    Input:
        data - data
        p_outliers - probability of outliers
    """
    data = pd.DataFrame(data)

    # generating outliers
    n_outliers = int(len(data) * p_outlier)
    outliers = data[:n_outliers].copy()

    # fast outliers
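# A hedged sketch of how the truncated add_outliers body might continue: half of
# the copied rows become implausibly fast responses and the rest implausibly slow
# ones. The 'rt' column name and the uniform ranges are assumptions for
# illustration (the snippet above is cut off before they appear), not the actual
# hddm implementation.
def add_outliers_sketch(data, p_outlier):
    data = pd.DataFrame(data)
    n_outliers = int(len(data) * p_outlier)
    outliers = data[:n_outliers].copy()
    half = n_outliers // 2
    # fast outliers: near-zero reaction times (assumed 'rt' column)
    outliers.loc[outliers.index[:half], 'rt'] = np.random.uniform(0.01, 0.1, size=half)
    # slow outliers: reaction times well beyond the typical range
    outliers.loc[outliers.index[half:], 'rt'] = np.random.uniform(5.0, 10.0, size=n_outliers - half)
    return pd.concat([data, outliers], ignore_index=True)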
# For a given entry in the index, identify which week it belongs to (multiple of 7).
which_week = lambda x: (x - data_df.index[0]).days / 7
weekly_group = data_df.groupby(which_week)
print "Weekly data:", weekly_group.mean()
print

# 9. Plots
monthly_data = monthly_group.mean()

from matplotlib import pyplot
monthly_data.plot()

# Force this plot to happen in a separate figure
pyplot.figure()
monthly_data.boxplot()
pyplot.show()

# 10. This is just another way to group records:
unique_monthly_grouped = data_df.groupby(lambda d: (d.month, d.year))
print '10. Mean wind speed for each month in each location'
print unique_monthly_grouped.mean()
print

# 11. Weekly stats over the first year
first_year = data_df.ix[:52 * 7, :]
weekly_first_year = first_year.groupby(which_week)
stats = weekly_first_year.apply(lambda x: x.describe())

import pandas
pandas.set_printoptions(max_rows=500, max_columns=15, notebook_repr_html=False)
print stats
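# A hedged alternative to the integer week-offset grouping above: with a
# DatetimeIndex, later pandas versions can compute weekly and monthly means via
# time-based resampling. Note the buckets are calendar weeks/months rather than
# 7-day offsets from the first date, so the grouping differs slightly.
weekly_means = data_df.resample('W').mean()
monthly_means = data_df.resample('M').mean()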
def train_and_predict(train_data_file, test_data_file, target_col, test_pred_file,
                      model_type, model_file, fit_args, test_metric, na_fill_value, silent):
    start = timeit.default_timer()
    train_x = pd.read_csv(train_data_file)
    mappings = dict()
    for col in train_x.columns:
        if train_x[col].dtype == np.dtype('object'):
            s = np.unique(train_x[col].fillna(na_fill_value).values)
            mappings[col] = pd.Series([x[0] for x in enumerate(s)], index=s)
            train_x[col] = train_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            train_x[col] = train_x[col].fillna(na_fill_value)
    train_y = train_x[target_col]
    del train_x[target_col]
    x_cols = train_x.columns
    feat_importance_fun = lambda (fitted_model): fitted_model.feature_importances_
    staged_predict = lambda (fitted_model, pred_x): [fitted_model.predict(pred_x)]
    predict = lambda (fitted_model, pred_x): fitted_model.predict(pred_x)
    model = None
    if model_type == "RandomForestRegressor":
        model = RandomForestRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)
    elif model_type == "GradientBoostingRegressor":
        model = GradientBoostingRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)
        staged_predict = lambda (fitted_model, pred_x): fitted_model.staged_predict(pred_x)
    save_model(model=model, model_file=model_file)
    del train_x, train_y
    test_x = pd.read_csv(test_data_file)
    for col in test_x.columns:
        if col in mappings:
            test_x[col] = test_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            test_x[col] = test_x[col].fillna(na_fill_value)
    test_y = None
    if target_col in test_x.columns:
        test_y = test_x[target_col][test_x[target_col] != na_fill_value]
        if len(test_y) != len(test_x):
            test_y = None
    test_x = test_x[x_cols]
    test_pred = pd.DataFrame({'pred': predict((model, test_x))})
    if not silent and test_y is not None:
        print_stages(test_y=test_y, stage_predictions=staged_predict((model, test_x)),
                     test_metric=test_metric)
    if not silent:
        feat_importance = feat_importance_fun(model)
        if feat_importance is not None:
            feat_importance = pd.DataFrame({'Features': x_cols, 'Importance': feat_importance})
            pd.set_printoptions(max_columns=len(test_x.columns), max_rows=len(test_x.columns))
            print("Feature importances:")
            feat_importance.sort(columns='Importance', ascending=False, inplace=True)
            feat_importance.index = range(1, len(feat_importance) + 1)
            print(feat_importance)
    test_pred.to_csv(test_pred_file, index=False)
    stop = timeit.default_timer()
    if not silent:
        print "Time: %d s" % (stop - start)
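# A hedged usage sketch for train_and_predict as defined above. The file names
# and fit_args values are placeholders for illustration; print_stages and
# save_model are assumed to be defined in the same module (print_stages appears
# at the top of this collection).
train_and_predict(train_data_file='train.csv',
                  test_data_file='test.csv',
                  target_col='target',
                  test_pred_file='predictions.csv',
                  model_type='GradientBoostingRegressor',
                  model_file='model.pkl',
                  fit_args={'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 4},
                  test_metric='mean_squared_error',
                  na_fill_value=-999,
                  silent=False)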
from pandas.io.data import DataReader
from datetime import datetime
from pandas import *
import pandas

pandas.set_printoptions(max_rows=2000, max_columns=14)

#from talib import abstract
import talib

#msft = DataReader("MSFT", "yahoo")
#msft = DataReader("MSFT", "yahoo", datetime(2009,1,1))
#print msft["Volume"]
#print msft["Adj Close"][-100:]
#print msft

vticker = "GOOG"
goog = DataReader(vticker, "yahoo", datetime(2009, 1, 1))
#print goog["Close"]
#print goog
#goog2 = goog["Close"].shift(1, freq=datetools.bday)
#goog2 = goog + goog
#print goog2.index
#pandas.set_printoptions(max_rows=2000, max_columns=10)

## Calculate NDC and previous closes
goog["NDC"] = goog["Close"].shift(-1)
goog["close-1"] = goog["Close"].shift(1)
goog["close-2"] = goog["Close"].shift(2)
goog["close-3"] = goog["Close"].shift(3)
goog["close-4"] = goog["Close"].shift(4)
goog["close-5"] = goog["Close"].shift(5)
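# A hedged note on pandas.io.data, used in several snippets here: it was later
# split out of pandas into the separate pandas-datareader package. A minimal
# equivalent sketch of the DataReader call above (the "yahoo" source itself has
# been unreliable over the years, so treat this as illustrative):
from datetime import datetime
from pandas_datareader import data as web

goog = web.DataReader("GOOG", "yahoo", datetime(2009, 1, 1))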
def crawlRawData(lXall):
    """ crawling raw data """
    print("Crawling raw data...")
    basedir = '../stumbled_upon/raw_content/'
    pfacebook = re.compile("www.{1,2}facebook.{1,2}com")
    pfacebook2 = re.compile("developers.{1,2}facebook.{1,2}com.{1,2}docs.{1,2}reference.{1,2}plugins.{1,2}like|facebook.{1,2}com.{1,2}plugins.{1,2}like")
    plinkedin = re.compile("platform.{1,2}linkedin.{1,2}com")
    ptwitter = re.compile("twitter.{1,2}com.{1,2}share")
    prss = re.compile("rss feed", re.IGNORECASE)
    pgooglep = re.compile("apis.{1,2}google.{1,2}com")
    #pstumble = re.compile("www.{1,2}stumbleupon.{1,2}com")
    pstumble = re.compile("stumbleupon")
    pcolor = re.compile("colorscheme|color_scheme|color=|color:", re.IGNORECASE)
    psignup = re.compile("signup|register|login|sign up", re.IGNORECASE)
    pcomment = re.compile("leave a comment|leave comment", re.IGNORECASE)
    pncomment = re.compile("comment-", re.IGNORECASE)
    pmail = re.compile("email", re.IGNORECASE)
    ppics = re.compile("\.png|\.tif|\.jpg", re.IGNORECASE)
    pgif = re.compile("\.gif", re.IGNORECASE)
    psmile = re.compile(":-\)|;-\)")
    plbreak = re.compile("<br>")
    psearch = re.compile("searchstring|customsearch|searchcontrol|searchquery|searchform|searchbox", re.IGNORECASE)
    pcaptcha = re.compile("captcha", re.IGNORECASE)
    padvert = re.compile("advertis", re.IGNORECASE)
    pnewline = re.compile("\n")
    pgooglead = re.compile("google_ad_client")
    phtml5 = re.compile("html5", re.IGNORECASE)
    phuff = re.compile("www.huffingtonpost.com", re.IGNORECASE)
    pflash = re.compile("shockwave-flash", re.IGNORECASE)
    pdynlink = re.compile("<a href.+?.+>")
    pnofollow = re.compile("rel=\"nofollow\"", re.IGNORECASE)
    pschemaorg = re.compile("schema\.org", re.IGNORECASE)
    pmobileredirect = re.compile("mobile redirect", re.IGNORECASE)
    #pshare = re.compile("sharearticle|share.{1,20}article", re.IGNORECASE)
    plang = re.compile("en-US|en_US", re.IGNORECASE)

    tutto = []
    for ind in lXall.index:
        row = []
        nl = 1.0 + lXall.ix[ind, 'numberOfLinks']
        nchar = 1.0 + lXall.ix[ind, 'non_markup_alphanum_characters']
        #print "numberOfLinks:", nl
        with open(basedir + str(ind), 'r') as content_file:
            content = content_file.read()
        #print "id:", ind,
        row.append(ind)
        res = pfacebook.findall(content)
        row.append(len(res) / float(nl))
        res = pfacebook2.findall(content)
        row.append(len(res) / float(nl))
        res = ptwitter.findall(content)
        row.append(len(res) / float(nl))
        #res = prss.findall(content)
        #row.append(len(res) / float(nl))
        #res = pgooglep.findall(content)
        #row.append(len(res) / float(nl))
        #res = pstumble.findall(content)
        #row.append(len(res) / float(nl))
        res = pncomment.findall(content)
        row.append(len(res))
        #res = pcolor.findall(content)
        #row.append(len(res))
        #res = psmile.findall(content)
        #row.append(len(res))
        #if len(res) > 0:
        #    print ind, ": ", res
        #    raw_input("HITKEY")
        #res = plbreak.findall(content)
        #row.append(len(res))
        #res = padvert.findall(content)
        #row.append(len(res))
        res = pnewline.findall(content)
        row.append(math.log(1.0 + len(res)))
        #res = pdynlink.findall(content)
        #row.append(len(res))
        #res = pnofollow.findall(content)
        #row.append(len(res))
        #res = pschemaorg.findall(content)
        #row.append(len(res))
        #res = pmobileredirect.findall(content)
        #row.append(len(res))
        #m = pgooglead.search(content)
        #if m:
        #    row.append(1)
        #else:
        #    row.append(0)
        #if len(res) > 0:
        #    print ind, ": ", res
        #    raw_input("HITKEY")
        #res = pshare.findall(content)
        #row.append(len(res) / float(nl))
        #print ""
        tutto.append(row)

    newdf = pd.DataFrame(tutto).set_index(0)
    newdf.columns = ['wwwfacebook_ratio', 'facebooklike_ratio', 'twitter_ratio', 'n_comment', 'logn_newline']
    pd.set_printoptions(max_rows=40, max_columns=20)
    print(newdf.head(20))
    print(newdf.describe())
    return newdf
# <codecell>

# a summary of the data is printed when there's "too much" to display
print(df)

# <codecell>

# in this "too much" case, use "columns" and "values"
# to peek at the first 3 rows of data
print(df.columns)
print(df[0:3].values)

# <codecell>

# or better yet, change what "too much" means
pd.set_printoptions(max_columns=27)
print(df[0:3])

# <markdowncell>

# Now for the subsetting features...

# <codecell>

# create a data frame consisting of a subset of columns
dfsubsetcols = df[['W', 'Q2', 'xsect', 'err']]
print(dfsubsetcols)

# <codecell>

dfsubsetrows = df[0:3]
from __future__ import division
from copy import copy
import itertools
import kabuki
import os
import unittest

import pymc as pm
import numpy as np
import pandas as pd
import nose

pd.set_printoptions(precision=4)

from nose import SkipTest

import hddm
from hddm.diag import check_model


def diff_model(param, subj=True, num_subjs=10, change=.5, size=500):
    params_cond_a = {'v': .5, 'a': 2., 'z': .5, 't': .3, 'st': 0., 'sv': 0., 'sz': 0.}
    params_cond_b = copy(params_cond_a)
    params_cond_b[param] += change

    params = {'A': params_cond_a, 'B': params_cond_b}

    data, subj_params = hddm.generate.gen_rand_data(params, subjs=num_subjs, size=size)

    model = hddm.models.HDDMTruncated(data, depends_on={param: ['condition']}, is_group_model=subj)

    return model


class TestMulti(unittest.TestCase):
'''
Created on May 2, 2013

@author: phcostello
'''

import pandas as pd
import numpy as np
from pandas.io.data import DataReader
from datetime import datetime
import sqlite3
import pandas.io.sql as psql
import csv
import timeit

pd.set_printoptions(max_colwidth=400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"


def showTables(con, display=True):
    cur = con.cursor()
    cur.execute("SELECT name, type FROM sqlite_master")
    tbls = pd.DataFrame(cur.fetchall())
    if (display):
        print tbls
    return tbls


def readDB(con, table, startDate, endDate, DateField=None, fields=None):
# <codecell>

starAlias = np.core.defchararray.replace(starAlias, '#', '')

# <codecell>

starAliasUnique = []
maxLen = 0
for i, thisStar in enumerate(np.unique(starAlias[:, 0])):
    x = starAlias[np.where(thisStar == starAlias)[0], 1]
    thisStarAliases = ", ".join(x)
    starAliasUnique.append([thisStar, thisStarAliases])
    # Get max len of alias column
    if len(thisStarAliases) > maxLen:
        maxLen = len(thisStarAliases)

print maxLen, 'is the length of the longest column'
starAliasUnique = np.array(starAliasUnique)

# <codecell>

labels = ['Name', 'Alias']
a = pd.DataFrame(starAliasUnique, dtype=str)
a.columns = labels
pd.set_printoptions(max_colwidth=maxLen)
print a.to_latex(index=False)
import sys
from scipy.stats.mstats import f_oneway
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100)

results = []
for group in groups:
    grouped = data.groupby(group)
    ave = grouped.mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        F, prob = f_oneway(*grouped[number].values)
        improvement = ave[number].max() / data[number].mean() - 1
        if prob < .05:
            ave = ave[ave[number].notnull() & (ave['#'] > 10)]
            results.append([group, number, improvement, prob,
                            ave.sort(number, ascending=False)])

for group, number, improvement, prob, ave in sorted(results, key=lambda v: v[2], reverse=True):
    print '\n%s by %s: %0.1f%% (%0.3f)' % (number, group, 100 * improvement, prob)
    # print '-' * 80
    # print ave.head(10)
    # if len(ave) > 20:
    #     print ave.tail(10)
grouped.mean()

# <headingcell level=4>

# Time-Series

# <codecell>

dates = pd.date_range('1/1/2000', periods=50)
df = pd.DataFrame(np.random.randn(50, 4), index=dates, columns=['A', 'B', 'C', 'D'])

# <codecell>

pd.set_printoptions(max_rows=49)

# <codecell>

df

# <codecell>

df.head(5)

# <codecell>

df.ix["2000-1-5":"2000-1-15"]

# <codecell>
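# A hedged note on the .ix slice above: .ix was later deprecated and removed
# from pandas; the same label-based date slicing works with .loc, e.g.:
df.loc["2000-1-5":"2000-1-15"]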
# IPython log file

get_ipython().magic(u'pinfo %logstart')
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
from pandas.tools.plotting import scatter_matrix

# need to manually set the print options in order to get data to display
pd.set_printoptions(max_columns=99)

# load the data using same command as last time
mrw_1992_data = sm.iolib.foreign.genfromdta('mrw1992.dta', missing_flt=NaN, missing_str=NaN, pandas=True)
print type(mrw_1992_data)

# view the entire dataset
mrw_1992_data
#[Out]#     c_index             c_name c_code  cont  nonoil  inter  oecd  gdp60  gdp85  popgrowth       igdp  school
#[Out]# 0         1            Algeria    DZA     1       1      1     0   2485   4371        2.6  24.100000     4.5
#[Out]# 1         2             Angola    AGO     1       1      0     0   1588   1171        2.1   5.800000     1.8
#[Out]# 2         3              Benin    BEN     1       1      0     0   1116   1071        2.4  10.800000     1.8
#[Out]# 3         4           Botswana    BWA     1       1      1     0    959   3671        3.2  28.299999     2.9
#[Out]# 4         5       Burkina Faso    BFA     1       1      0     0    529    857        0.9  12.700000     0.4
#[Out]# 5         6            Burundi    BDI     1       1      0     0    755    663        1.7   5.100000     0.4
#[Out]# 6         7           Cameroon    CMR     1       1      1     0    889   2190        2.1  12.800000     3.4
#[Out]# 7         8  Central Afr. Rep.    CAF     1       1      0     0    838    789        1.7  10.500000     1.4
#[Out]# 8         9               Chad    TCD     1       1      0     0    908    462        1.9   6.900000     0.4
#[Out]# 9        10           PR Congo    RCB     1       1      0     0   1009   2624        2.4  28.799999     3.8
#[Out]# 10       11              Egypt    EGY     1       1      0     0    907   2160        2.5  16.299999     7.0
#[Out]# 11       12           Ethiopia    ETH     1       1      1     0    533    608        2.3   5.400000     1.1
#[Out]# 12       13              Gabon    GAB     1       0      0     0   1307   5350        1.4  22.100000     2.6
#[Out]# 13       14        Gambia, The    GMB     1       0      0     0    799    NaN        NaN  18.100000     1.5
# GitHub link for the talk. You can clone the data and play with it yourself.
# Please submit any improvements as pull requests.
#
# [https://github.com/jseabold/538model](https://github.com/jseabold/538model)

# <codecell>

import datetime

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats

np.set_printoptions(precision=4, suppress=True)
pandas.set_printoptions(notebook_repr_html=False, precision=4, max_columns=12,
                        column_space=10, max_colwidth=25)

# <codecell>

today = datetime.datetime(2012, 10, 2)

# <headingcell level=2>

# Outline

# <markdowncell>

# Methodology was obtained from the old [538 Blog](http://www.fivethirtyeight.com/2008/03/frequently-asked-questions-last-revised.html) with updates at the [new site hosted by the New York Times](http://fivethirtyeight.blogs.nytimes.com/methodology/)

# <markdowncell>
import sys
from scipy.stats.mstats import f_oneway
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100)

for group in groups:
    grouped = data.groupby(group)
    ave = grouped.mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        F, prob = f_oneway(*grouped[number].values)
        ave = ave[ave[number].notnull() & (ave['#'] > 10)]
        if len(ave):
            print '\n%s by %s: %0.3f' % (number, group, prob)
            print ave.sort(number, ascending=False).head(10)
# <codecell>

e = pd.ExcelWriter("/data/adrian/Dropbox/pd_tf_enrichment_20130820.xls")

# <codecell>

e = open("/data/adrian/Dropbox/ptables/pd_tf_enrichment_20130822.html", "w")

# <codecell>

out_path = "/data/adrian/Dropbox/ptables"

# <codecell>

pd.set_printoptions(max_colwidth=10000)

# <codecell>

top_tfs = chea_binding

# <codecell>

for k in top_tfs.keys():
    if top_tfs[k] is not None:
        print k
        t = top_tfs[k].sort_index(by="pval", ascending=True)[0:30].merge(tf_target_counts, left_on="tf_name", right_index=True)
        e.write("<hr>")
        e.write("<h2>" + k + "</h2>")
        m = top_motifs[k].merge(mm9_motif_gene_counts, left_on="motif_name", right_index=True).sort_index(by="pval", ascending=True)[0:30].drop(["motif_name_x", "motif_name_y"], axis=1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

from credit_card_data import read_data

pd.set_printoptions(max_rows=50, max_columns=21)

df = read_data()
pos = df[df.anomaly]
neg = df[df.anomaly == False]

pmf_pos = pos.groupby('field4').size().apply(lambda x: float(x) / len(pos))
pmf_neg = neg.groupby('field4').size().apply(lambda x: float(x) / len(neg))

pmf_neg.plot(c='b')
pmf_pos.plot(c='r')
plt.show()
plt.clf()


def discrete_distribution(xk, pk):
    return stats.rv_discrete(name='hour1', values=(xk, pk))

xk_pos = tuple(pmf_pos.index)
pk_pos = tuple(pmf_pos.values)
rv_pos = discrete_distribution(xk_pos, pk_pos)

xk_neg = tuple(pmf_neg.index)
pk_neg = tuple(pmf_neg.values)
rv_neg = discrete_distribution(xk_neg, pk_neg)

# estimate the mean value of `field4` for anomalous transactions
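# A minimal sketch of the estimate promised by the closing comment above:
# rv_discrete exposes the exact mean of the fitted distribution, and rvs() can
# draw a Monte Carlo sample if an empirical estimate is preferred.
print(rv_pos.mean())              # exact mean of `field4` under the anomaly distribution
sample = rv_pos.rvs(size=10000)
print(sample.mean())              # sampled estimate for comparison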
import pandas as pd
import os
from pandas import *
import numpy as np

# cleaning 2006 data
df2006 = pd.read_csv('/home/yashu/Desktop/assignmentclasses/homework_04/2006.csv')  # change the path location as needed
pd.set_printoptions(max_columns=841)  # optional: print all the columns
total_count = df2006.ix[:, 0].count()  # total number of rows, including missing
df2006 = df2006.applymap(lambda x: np.nan if x == -999 else x)  # applymap visits every cell in the dataframe
required_col = []  # columns to keep
for col in df2006.columns:
    # cut-off is 30%: columns with more than 30% missing values are dropped
    if float(float(df2006[col].count()) / float(total_count)) >= 0.7:
        required_col.append(col)
df2006_processed = df2006[required_col]

# cleaning 2010 data
df2010 = pd.read_csv('/home/yashu/Desktop/assignmentclasses/homework_04/2010.csv')
total_count_1 = df2010.ix[:, 0].count()  # total number of rows, including missing
df2010 = df2010.applymap(lambda x: np.nan if x == -999 else x)  # applymap visits every cell in the dataframe
required_col_1 = []  # columns to keep
for col in df2010.columns:
    # cut-off is 30%: columns with more than 30% missing values are dropped
    if float(float(df2010[col].count()) / float(total_count_1)) >= 0.7:
        required_col_1.append(col)
df2010_processed = df2010[required_col_1]

# Reindex both frames onto the shared columns
col1 = df2006_processed.columns.intersection(df2010_processed.columns)
df2006_clean = df2006_processed.reindex(columns=col1)
df2010_clean = df2010_processed.reindex(columns=col1)
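# A hedged alternative sketch for the same cleaning step: replace() swaps the
# -999 sentinel for NaN in one shot, and dropna(thresh=...) keeps only columns
# with at least 70% non-missing values, avoiding the explicit loop (frame names
# reuse the objects defined above).
df2006_alt = df2006.replace(-999, np.nan)
df2006_alt = df2006_alt.dropna(axis=1, thresh=int(0.7 * len(df2006_alt)))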
# Use the source: https://github.com/statsmodels/statsmodels

# <headingcell level=3>

# Tutorial Import Assumptions

# <codecell>

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats

np.set_printoptions(precision=4, suppress=True)
pandas.set_printoptions(notebook_repr_html=False, precision=4, max_columns=12)

# <headingcell level=3>

# Statsmodels Import Convention

# <codecell>

import statsmodels.api as sm

# <markdowncell>

# Import convention for models for which a formula is available.

# <codecell>