Example #1
def print_stages(test_y, stage_predictions, test_metric):
    if hasattr(ml_metrics, test_metric):
        eval_metric = getattr(ml_metrics, test_metric)
    else:
        eval_metric = getattr(skMetrics, test_metric)
    count = 0
    iters = []
    loss = []
    for prediction in stage_predictions:
        count += 1
        if count in [1, 5, 10, 30] or count % 50 == 0:
            iters.append(count)
            loss.append(eval_metric(test_y, prediction))
    loss_df = pd.DataFrame({'Iteration': iters, 'Loss': loss})
    loss_df.rename(columns={'Loss': test_metric}, inplace=True)
    pd.set_printoptions(max_columns=len(loss_df.columns), max_rows=len(loss_df))
    print("Loss:")
    print(loss_df)
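Note: pd.set_printoptions, used throughout these examples, was deprecated and later removed from pandas. A minimal sketch of the modern replacement (an added note, not part of the original example) uses pd.set_option:

import pandas as pd
# current equivalents of the set_printoptions(max_columns=..., max_rows=...) call above
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)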
Example #3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from credit_card_data import read_data

pd.set_printoptions(max_rows=50, max_columns=21)


def make_hist(frame, to_bin, norm=False):

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot("111")

    ylab = 'Probability' if norm else 'Count'
    xlab = frame[to_bin].name

    ax.set_ylabel(ylab, fontsize=12, color='blue')
    ax.set_xlabel(xlab, fontsize=12, color='blue')

    ticks = range(24)  # military times
    labels = map(lambda x: str("%s:00" % x), ticks)
    plt.xticks(ticks, labels)

    if not norm: ax.set_ylim(0, 8000)

    for tick in plt.gca().xaxis.iter_ticks():
        tick[0].label2On = True
        tick[0].label1On = False
        tick[0].label2.set_rotation('vertical')
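The per-tick label1On/label2On handling above comes from an older matplotlib API. A rough modern equivalent (a self-contained sketch, assuming matplotlib 2.x or later; not part of the original) is:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 4))
ticks = list(range(24))                # military times
labels = ["%s:00" % h for h in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
# show the hour labels on top of the axis, rotated vertically
ax.tick_params(axis='x', labeltop=True, labelbottom=False, labelrotation=90)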
Example #4
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sys import maxsize
pd.set_printoptions(threshold=maxsize)
print('#', '~' * 15, 'Load Data', '~' * 50)
od_one = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatest.txt",
    sep=',')
od_two = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatraining.txt",
    sep=',')
od_three = pd.read_table(
    "/Users/thomaskeeley/Documents/school/data_mining/occupancy_data/datatest2.txt",
    sep=',')

print(list(od_one))
print(list(od_two))
print(list(od_three))

#Combine datasets
od_full = pd.concat([od_one, od_two, od_three])

#Split the DateTime feature to separate columns
od_full[['date',
         'time']] = od_full.date.apply(lambda x: pd.Series(str(x).split(" ")))
od_full.head()
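Instead of splitting the raw string, the date/time separation above could also be done with pandas datetime accessors (a hedged sketch on a toy frame; the timestamps and column names here are only illustrative):

import pandas as pd
toy = pd.DataFrame({'date': ['2015-02-04 17:51:00', '2015-02-04 17:55:00']})
stamp = pd.to_datetime(toy['date'])
toy['day'] = stamp.dt.date     # calendar-date component
toy['clock'] = stamp.dt.time   # time-of-day component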
Example #5
import pandas as pd
from datetime import datetime
from pandas.io.data import DataReader

pd.set_printoptions(max_rows=2000)


aapl = DataReader("AAPL", "yahoo", datetime(2010, 1, 1), datetime(2012, 10, 30))
returns = aapl.pct_change()

def f(x):
    if x > 0.01:
        return 1
    elif x < -0.01:
        return -1
    else:
        return 0

frame = returns.applymap(f)

frame['UP'] = frame['Adj Close'] == 1
frame['DOWN'] = frame['Adj Close'] == -1
frame['NONE'] = frame['Adj Close'] ==0

from lxml import etree
import datetime
from dateutil.parser import parse
path = "http://www.google.com/finance/company_news?q=NASDAQ:AAPL&output=rss&num=500"
root = etree.parse(path)
myRoot = root.getroot()
news={}
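pandas.io.data was later split out of pandas into the separate pandas_datareader package, so the import above no longer works on current releases. A minimal sketch of the replacement (assumes pandas_datareader is installed; 'stooq' is named here only because the original 'yahoo' backend has become unreliable):

from datetime import datetime
from pandas_datareader import data as pdr

aapl = pdr.DataReader('AAPL', 'stooq', datetime(2010, 1, 1), datetime(2012, 10, 30))
returns = aapl.pct_change()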
Example #6
# Must have xlrd module installed for ExcelFile to work

import pandas
import matplotlib.pyplot as plt
import scikits.statsmodels.tools.tools as tools
import numpy as np
from matplotlib.ticker import MultipleLocator

# Open the Excel file
xls = pandas.ExcelFile(r'D:\tomcat\webapps\DensoOBD\files\35400\08-12M HAM Door Jamb Switch.xls')
# Parse the specific Excel sheet name
df = xls.parse('Claims')
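In more recent pandas the ExcelFile/parse pair above can be a single read_excel call (a sketch; it still needs an Excel engine such as xlrd or openpyxl installed):

df = pandas.read_excel(r'D:\tomcat\webapps\DensoOBD\files\35400\08-12M HAM Door Jamb Switch.xls',
                       sheet_name='Claims')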

pandas.set_printoptions(precision=6, max_columns=12)

data = df[(df['MODEL_YEAR']==2009)
        & (df['FACTORY_CODE']=='ELP')
        & (df['MODEL_NAME']=='CRV')]

counts = data.groupby(['RO_DEALER_STATE'])['VIN'].count()

qty = counts.values
state = counts.index

# zip the qty and state lists together so we can sort it by qty
data_list_numeric = zip(qty, state)
data_list_numeric.sort()
data_list_numeric.reverse()

xlabels = []
yvalues = []
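The zip/sort/reverse sequence above can also be done directly on the pandas Series (a sketch continuing from the counts Series built above, assuming a pandas version with Series.sort_values):

ranked = counts.sort_values(ascending=False)
xlabels = list(ranked.index)    # states, largest claim count first
yvalues = list(ranked.values)   # corresponding claim counts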
Example #7
import pandas as pd
from pandas.io.data import DataReader
from datetime import datetime

import sqlite3 
import pandas.io.sql as psql
import csv

import timeit
from django.utils.encoding import force_unicode


pd.set_printoptions(max_colwidth = 400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"


con = sqlite3.connect(path+ dbfile, detect_types=sqlite3.PARSE_DECLTYPES)

#get rows with polling in text
sqlPolling = 'SELECT [twitter.text], Newsworthy \
            FROM HT_Annotated WHERE ([twitter.text] LIKE "% polling %"  )'

polling = psql.read_frame(sqlPolling , con)
polling.head()
polling = pd.DataFrame(polling)

#Get rows with wordcount > 12
func_wordcounts = lambda x: len(x.strip().split(" "))
Example #8
import sys
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups  = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100, max_rows=5000)

for group in groups:
    ave = data.groupby(group).mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        print '\n%s by %s' % (number, group)
        ave = ave[ave[number].notnull() & (ave['#'] > 10)]
        print ave.sort(number, ascending=False)
Example #9
from __future__ import division
from copy import copy
import itertools
import glob
import os

import unittest
import pymc as pm
import numpy as np
import pandas as pd
import nose
pd.set_printoptions(precision=4)
from nose import SkipTest

import hddm
from hddm.diag import check_model


def add_outliers(data, p_outlier):
    """add outliers to data. half of the outliers will be fast, and the rest will be slow
    Input:
        data - data
        p_outlier - proportion of outliers to add
    """
    data = pd.DataFrame(data)

    #generating outliers
    n_outliers = int(len(data) * p_outlier)
    outliers = data[:n_outliers].copy()

    #fast outliers
Example #10
# For a given entry in the index, identify which week it belongs to (multiple of
# 7).
which_week = lambda x: (x - data_df.index[0]).days / 7
weekly_group = data_df.groupby(which_week)
print "Weekly data:", weekly_group.mean()
print

# 9. Plots
monthly_data = monthly_group.mean()
from matplotlib import pyplot
monthly_data.plot()
# Force this plot to happen in a separate figure
pyplot.figure()
monthly_data.boxplot()
pyplot.show()

# 10. This is just another way to group records:
unique_monthly_grouped = data_df.groupby(lambda d: (d.month, d.year))
print '10. Mean wind speed for each month in each location'
print unique_monthly_grouped.mean()
print

# 11. Weekly stats over the first year
first_year = data_df.ix[:52 * 7, :]
weekly_first_year = first_year.groupby(which_week)
stats = weekly_first_year.apply(lambda x: x.describe())
import pandas
pandas.set_printoptions(max_rows=500, max_columns=15, notebook_repr_html=False)
print stats
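When the frame has a DatetimeIndex, the same weekly/monthly aggregation can be expressed with resample (a sketch continuing from data_df above; not part of the original script):

weekly_mean = data_df.resample('W').mean()    # one row per calendar week
monthly_mean = data_df.resample('M').mean()   # one row per calendar month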
Example #11
def train_and_predict(train_data_file, test_data_file, target_col, test_pred_file,
                      model_type, model_file, fit_args, test_metric, na_fill_value,
                      silent):

    start = timeit.default_timer()

    train_x = pd.read_csv(train_data_file)

    mappings = dict()
    for col in train_x.columns:
        if train_x[col].dtype == np.dtype('object'):
            s = np.unique(train_x[col].fillna(na_fill_value).values)
            mappings[col] = pd.Series([x[0] for x in enumerate(s)], index=s)
            train_x[col] = train_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            train_x[col] = train_x[col].fillna(na_fill_value)
    train_y = train_x[target_col]
    del train_x[target_col]

    x_cols = train_x.columns
    feat_importance_fun = lambda (fitted_model): fitted_model.feature_importances_
    staged_predict = lambda (fitted_model, pred_x): [fitted_model.predict(pred_x)]
    predict = lambda (fitted_model, pred_x): fitted_model.predict(pred_x)

    model = None
    if model_type == "RandomForestRegressor":
        model = RandomForestRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)

    elif model_type == "GradientBoostingRegressor":
        model = GradientBoostingRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)
        staged_predict = lambda (fitted_model, pred_x): fitted_model.staged_predict(pred_x)

    save_model(model=model, model_file=model_file)

    del train_x, train_y

    test_x = pd.read_csv(test_data_file)
    for col in test_x.columns:
        if col in mappings:
            test_x[col] = test_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            test_x[col] = test_x[col].fillna(na_fill_value)

    test_y = None
    if target_col in test_x.columns:
        test_y = test_x[target_col][test_x[target_col] != na_fill_value]
        if len(test_y) != len(test_x):
            test_y = None

    test_x = test_x[x_cols]

    test_pred = pd.DataFrame({'pred': predict((model, test_x))})
    if not silent and test_y is not None:
        print_stages(test_y=test_y, stage_predictions=staged_predict((model, test_x)), test_metric=test_metric)

    if not silent:
        feat_importance = feat_importance_fun(model)
        if feat_importance is not None:
            feat_importance = pd.DataFrame({'Features': x_cols,
                                            'Importance': feat_importance})
            pd.set_printoptions(max_columns=len(test_x.columns), max_rows=len(test_x.columns))
            print("Feature importances:")
            feat_importance.sort(columns='Importance', ascending=False, inplace=True)
            feat_importance.index = range(1, len(feat_importance) + 1)
            print(feat_importance)

    test_pred.to_csv(test_pred_file, index=False)
    stop = timeit.default_timer()
    if not silent:
        print "Time: %d s" % (stop - start)
Example #12
import pandas as pd
from datetime import datetime
from pandas.io.data import DataReader

pd.set_printoptions(max_rows=2000)

aapl = DataReader("AAPL", "yahoo", datetime(2010, 1, 1),
                  datetime(2012, 10, 30))
returns = aapl.pct_change()


def f(x):
    if x > 0.01:
        return 1
    elif x < -0.01:
        return -1
    else:
        return 0


frame = returns.applymap(f)

frame['UP'] = frame['Adj Close'] == 1
frame['DOWN'] = frame['Adj Close'] == -1
frame['NONE'] = frame['Adj Close'] == 0

from lxml import etree
import datetime
from dateutil.parser import parse
path = "http://www.google.com/finance/company_news?q=NASDAQ:AAPL&output=rss&num=500"
root = etree.parse(path)
Example #13
from pandas.io.data import DataReader
from datetime import datetime
from pandas import *
import pandas
pandas.set_printoptions(max_rows=2000, max_columns=14)
#from talib import abstract
import talib
#msft = DataReader("MSFT",  "yahoo")
#msft = DataReader("MSFT",  "yahoo", datetime(2009,1,1))
#print msft["Volume"]
#print msft["Adj Close"][-100:]
#print msft
vticker = "GOOG"
goog = DataReader(vticker,  "yahoo", datetime(2009,1,1))
#print goog["Close"]
#print goog

#goog2 = goog["Close"].shift(1, freq=datetools.bday)

#goog2 = goog + goog
#print goog2.index

#pandas.set_printoptions(max_rows=2000, max_columns=10)

## Calculate NDC and Previous closes
goog["NDC"] =  goog["Close"].shift(-1)
goog["close-1"] =  goog["Close"].shift(1)
goog["close-2"] =  goog["Close"].shift(2)
goog["close-3"] =  goog["Close"].shift(3)
goog["close-4"] =  goog["Close"].shift(4)
goog["close-5"] =  goog["Close"].shift(5)
Example #14
def crawlRawData(lXall):
      """
      crawling raw data
      """
      print("Crawling raw data...")
      basedir='../stumbled_upon/raw_content/'
      pfacebook = re.compile("www.{1,2}facebook.{1,2}com")
      pfacebook2 = re.compile("developers.{1,2}facebook.{1,2}com.{1,2}docs.{1,2}reference.{1,2}plugins.{1,2}like|facebook.{1,2}com.{1,2}plugins.{1,2}like")
      plinkedin = re.compile("platform.{1,2}linkedin.{1,2}com")
      ptwitter = re.compile("twitter.{1,2}com.{1,2}share")
      prss=re.compile("rss feed",re.IGNORECASE)
      pgooglep=re.compile("apis.{1,2}google.{1,2}com")
      #pstumble=re.compile("www.{1,2}stumbleupon.{1,2}com")
      pstumble=re.compile("stumbleupon")
      pcolor=re.compile("colorscheme|color_scheme|color=|color:",re.IGNORECASE)
      psignup=re.compile("signup|register|login|sign up",re.IGNORECASE)
      pcomment=re.compile("leave a comment|leave comment",re.IGNORECASE)
      pncomment=re.compile("comment-",re.IGNORECASE)
      pmail=re.compile("email",re.IGNORECASE)
      ppics=re.compile("\.png|\.tif|\.jpg",re.IGNORECASE)
      pgif=re.compile("\.gif",re.IGNORECASE)
      psmile=re.compile(":-\)|;-\)")
      plbreak=re.compile("<br>")
      psearch=re.compile("searchstring|customsearch|searchcontrol|searchquery|searchform|searchbox",re.IGNORECASE)
      pcaptcha=re.compile("captcha",re.IGNORECASE)
      padvert=re.compile("advertis",re.IGNORECASE)
      pnewline=re.compile("\n")
      pgooglead=re.compile("google_ad_client")
      phtml5=re.compile("html5",re.IGNORECASE)
      phuff=re.compile("www.huffingtonpost.com",re.IGNORECASE)
      pflash=re.compile("shockwave-flash",re.IGNORECASE)
      pdynlink=re.compile("<a href.+?.+>")
      pnofollow=re.compile("rel=\"nofollow\"",re.IGNORECASE)
      pschemaorg=re.compile("schema\.org",re.IGNORECASE)
      pmobileredirect=re.compile("mobile redirect",re.IGNORECASE)
      
      #pshare=re.compile("sharearticle|share.{1,20}article",re.IGNORECASE)
      plang=re.compile("en-US|en_US",re.IGNORECASE)
      tutto=[]
      for ind in lXall.index:
	  row=[]
	  nl=1.0+lXall.ix[ind,'numberOfLinks']
	  nchar=1.0+lXall.ix[ind,'non_markup_alphanum_characters']
	  #print "numberOfLinks:",nl
	  with open(basedir+str(ind), 'r') as content_file:
	    content = content_file.read()
	    #print "id:",ind,
	    row.append(ind)
	    
	    res = pfacebook.findall(content)
	    row.append(len(res)/float(nl))
	    
	    res = pfacebook2.findall(content)	    
	    row.append(len(res)/float(nl))
	    
	    res = ptwitter.findall(content)
	    row.append(len(res)/float(nl))
	
	    
	    #res = prss.findall(content)
	    #row.append(len(res)/float(nl))
	    
	    #res = pgooglep.findall(content)	    
	    #row.append(len(res)/float(nl))
	    
	    #res = pstumble.findall(content)	    
	    #row.append(len(res)/float(nl))
	    
	    res = pncomment.findall(content)	    
	    row.append(len(res))
	    
	    #res = pcolor.findall(content)	    
	    #row.append(len(res))
	    
	    #res = psmile.findall(content)	    
	    #row.append(len(res))
	    
	    #if len(res)>0:
		#print ind,": ",res
		#raw_input("HITKEY")
	    
	    #res = plbreak.findall(content)	    
	    #row.append(len(res))
	    
	    #res = padvert.findall(content)	    
	    #row.append(len(res))
	    
	    res = pnewline.findall(content)	    
	    row.append(math.log(1.0+len(res)))
	    
	    #res = pdynlink.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pnofollow.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pschemaorg.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pmobileredirect.findall(content)	    
	    #row.append(len(res))
	    
	    
	    
	    #m = pgooglead.search(content)
	    #if m:
	#	row.append(1)
	 #   else:
	#	row.append(0)
	    
	    #if len(res)>0:
		#print ind,": ",res
		#raw_input("HITKEY")

		
	    #res = pshare.findall(content)
	    #row.append(len(res)/float(nl))
	  #print ""
	  tutto.append(row)
      newdf=pd.DataFrame(tutto).set_index(0)
      newdf.columns=['wwwfacebook_ratio','facebooklike_ratio','twitter_ratio','n_comment','logn_newline']
      pd.set_printoptions(max_rows=40, max_columns=20)
      print(newdf.head(20))
      print(newdf.describe())
      return newdf
Example #15
def crawlRawData(lXall):
      """
      crawling raw data
      """
      print "Crawling raw data..."
      basedir='../stumbled_upon/raw_content/'
      pfacebook = re.compile("www.{1,2}facebook.{1,2}com")
      pfacebook2 = re.compile("developers.{1,2}facebook.{1,2}com.{1,2}docs.{1,2}reference.{1,2}plugins.{1,2}like|facebook.{1,2}com.{1,2}plugins.{1,2}like")
      plinkedin = re.compile("platform.{1,2}linkedin.{1,2}com")
      ptwitter = re.compile("twitter.{1,2}com.{1,2}share")
      prss=re.compile("rss feed",re.IGNORECASE)
      pgooglep=re.compile("apis.{1,2}google.{1,2}com")
      #pstumble=re.compile("www.{1,2}stumbleupon.{1,2}com")
      pstumble=re.compile("stumbleupon")
      pcolor=re.compile("colorscheme|color_scheme|color=|color:",re.IGNORECASE)
      psignup=re.compile("signup|register|login|sign up",re.IGNORECASE)
      pcomment=re.compile("leave a comment|leave comment",re.IGNORECASE)
      pncomment=re.compile("comment-",re.IGNORECASE)
      pmail=re.compile("email",re.IGNORECASE)
      ppics=re.compile("\.png|\.tif|\.jpg",re.IGNORECASE)
      pgif=re.compile("\.gif",re.IGNORECASE)
      psmile=re.compile(":-\)|;-\)")
      plbreak=re.compile("<br>")
      psearch=re.compile("searchstring|customsearch|searchcontrol|searchquery|searchform|searchbox",re.IGNORECASE)
      pcaptcha=re.compile("captcha",re.IGNORECASE)
      padvert=re.compile("advertis",re.IGNORECASE)
      pnewline=re.compile("\n")
      pgooglead=re.compile("google_ad_client")
      phtml5=re.compile("html5",re.IGNORECASE)
      phuff=re.compile("www.huffingtonpost.com",re.IGNORECASE)
      pflash=re.compile("shockwave-flash",re.IGNORECASE)
      pdynlink=re.compile("<a href.+?.+>")
      pnofollow=re.compile("rel=\"nofollow\"",re.IGNORECASE)
      pschemaorg=re.compile("schema\.org",re.IGNORECASE)
      pmobileredirect=re.compile("mobile redirect",re.IGNORECASE)
      
      #pshare=re.compile("sharearticle|share.{1,20}article",re.IGNORECASE)
      plang=re.compile("en-US|en_US",re.IGNORECASE)
      tutto=[]
      for ind in lXall.index:
	  row=[]
	  nl=1.0+lXall.ix[ind,'numberOfLinks']
	  nchar=1.0+lXall.ix[ind,'non_markup_alphanum_characters']
	  #print "numberOfLinks:",nl
	  with open(basedir+str(ind), 'r') as content_file:
	    content = content_file.read()
	    #print "id:",ind,
	    row.append(ind)
	    
	    res = pfacebook.findall(content)
	    row.append(len(res)/float(nl))
	    
	    res = pfacebook2.findall(content)	    
	    row.append(len(res)/float(nl))
	    
	    res = ptwitter.findall(content)
	    row.append(len(res)/float(nl))
	
	    
	    #res = prss.findall(content)
	    #row.append(len(res)/float(nl))
	    
	    #res = pgooglep.findall(content)	    
	    #row.append(len(res)/float(nl))
	    
	    #res = pstumble.findall(content)	    
	    #row.append(len(res)/float(nl))
	    
	    res = pncomment.findall(content)	    
	    row.append(len(res))
	    
	    #res = pcolor.findall(content)	    
	    #row.append(len(res))
	    
	    #res = psmile.findall(content)	    
	    #row.append(len(res))
	    
	    #if len(res)>0:
		#print ind,": ",res
		#raw_input("HITKEY")
	    
	    #res = plbreak.findall(content)	    
	    #row.append(len(res))
	    
	    #res = padvert.findall(content)	    
	    #row.append(len(res))
	    
	    res = pnewline.findall(content)	    
	    row.append(math.log(1.0+len(res)))
	    
	    #res = pdynlink.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pnofollow.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pschemaorg.findall(content)	    
	    #row.append(len(res))
	    
	    #res = pmobileredirect.findall(content)	    
	    #row.append(len(res))
	    
	    
	    
	    #m = pgooglead.search(content)
	    #if m:
	#	row.append(1)
	 #   else:
	#	row.append(0)
	    
	    #if len(res)>0:
		#print ind,": ",res
		#raw_input("HITKEY")

		
	    #res = pshare.findall(content)
	    #row.append(len(res)/float(nl))
	  #print ""
	  tutto.append(row)
      newdf=pd.DataFrame(tutto).set_index(0)
      newdf.columns=['wwwfacebook_ratio','facebooklike_ratio','twitter_ratio','n_comment','logn_newline']
      pd.set_printoptions(max_rows=40, max_columns=20)
      print newdf.head(20)
      print newdf.describe()
      return newdf
Example #16
# <codecell>

# a summary of the data is printed when there's "too much" to display
print(df)

# <codecell>

# in this "too much" case, use "columns" and "values"
# to peek at the first 3 rows of data
print(df.columns)
print(df[0:3].values)

# <codecell>

# or better yet, change what "too much" means
pd.set_printoptions(max_columns=27)
print(df[0:3])

# <markdowncell>

# Now for the subsetting features...

# <codecell>

# create a data frame consisting of a subset of columns
dfsubsetcols = df[['W', 'Q2', 'xsect', 'err']]
print(dfsubsetcols)

# <codecell>

dfsubsetrows = df[0:3]
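In newer pandas, "changing what too much means" can also be scoped to a single statement with option_context, since set_printoptions is gone from current releases (a sketch of the modern API, not part of the original notebook):

with pd.option_context('display.max_columns', 27):
    print(df[0:3])   # widened display applies only inside this block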
Example #17
def train_and_predict(train_data_file, test_data_file, target_col, test_pred_file,
                      model_type, model_file, fit_args, test_metric, na_fill_value,
                      silent):

    start = timeit.default_timer()
    train_x = pd.read_csv(train_data_file)
    mappings = dict()
    for col in train_x.columns:
        if train_x[col].dtype == np.dtype('object'):
            s = np.unique(train_x[col].fillna(na_fill_value).values)
            mappings[col] = pd.Series([x[0] for x in enumerate(s)], index=s)
            train_x[col] = train_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            train_x[col] = train_x[col].fillna(na_fill_value)
    train_y = train_x[target_col]
    del train_x[target_col]

    x_cols = train_x.columns
    feat_importance_fun = lambda (fitted_model): fitted_model.feature_importances_
    staged_predict = lambda (fitted_model, pred_x): [fitted_model.predict(pred_x)]
    predict = lambda (fitted_model, pred_x): fitted_model.predict(pred_x)

    model = None
    if model_type == "RandomForestRegressor":
        model = RandomForestRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)

    elif model_type == "GradientBoostingRegressor":
        model = GradientBoostingRegressor(**fit_args)
        model.fit(X=train_x, y=train_y)
        staged_predict = lambda (fitted_model, pred_x): fitted_model.staged_predict(pred_x)

    save_model(model=model, model_file=model_file)

    del train_x, train_y

    test_x = pd.read_csv(test_data_file)
    for col in test_x.columns:
        if col in mappings:
            test_x[col] = test_x[col].map(mappings[col]).fillna(na_fill_value)
        else:
            test_x[col] = test_x[col].fillna(na_fill_value)

    test_y = None
    if target_col in test_x.columns:
        test_y = test_x[target_col][test_x[target_col] != na_fill_value]
        if len(test_y) != len(test_x):
            test_y = None

    test_x = test_x[x_cols]

    test_pred = pd.DataFrame({'pred': predict((model, test_x))})
    if not silent and test_y is not None:
        print_stages(test_y=test_y, stage_predictions=staged_predict((model, test_x)), test_metric=test_metric)

    if not silent:
        feat_importance = feat_importance_fun(model)
        if feat_importance is not None:
            feat_importance = pd.DataFrame({'Features': x_cols,
                                            'Importance': feat_importance})
            pd.set_printoptions(max_columns=len(test_x.columns), max_rows=len(test_x.columns))
            print("Feature importances:")
            feat_importance.sort(columns='Importance', ascending=False, inplace=True)
            feat_importance.index = range(1, len(feat_importance) + 1)
            print(feat_importance)

    test_pred.to_csv(test_pred_file, index=False)
    stop = timeit.default_timer()
    if not silent:
        print "Time: %d s" % (stop - start)
Example #18
from __future__ import division
from copy import copy
import itertools
import kabuki
import os

import unittest
import pymc as pm
import numpy as np
import pandas as pd
import nose
pd.set_printoptions(precision=4)
from nose import SkipTest

import hddm
from hddm.diag import check_model

def diff_model(param, subj=True, num_subjs=10, change=.5, size=500):
    params_cond_a = {'v':.5, 'a':2., 'z':.5, 't': .3, 'st':0., 'sv':0., 'sz':0.}
    params_cond_b = copy(params_cond_a)
    params_cond_b[param] += change

    params = {'A': params_cond_a, 'B': params_cond_b}

    data, subj_params = hddm.generate.gen_rand_data(params, subjs=num_subjs, size=size)

    model = hddm.models.HDDMTruncated(data, depends_on={param:['condition']}, is_group_model=subj)

    return model

class TestMulti(unittest.TestCase):
Example #19
'''
Created on May 2, 2013

@author: phcostello
'''

import pandas as pd
import numpy as np
from pandas.io.data import DataReader
from datetime import datetime
import sqlite3
import pandas.io.sql as psql
import csv
import timeit

pd.set_printoptions(max_colwidth=400)
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"


def showTables(con, display=True):

    cur = con.cursor()
    cur.execute("SELECT name, type FROM sqlite_master")
    tbls = pd.DataFrame(cur.fetchall())
    if (display):
        print tbls
    return tbls
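For reference, the cursor/fetchall round-trip in showTables can be collapsed into one call with pandas' SQL helper (a sketch assuming a pandas version that provides read_sql; pandas.io.sql, imported above as psql, is the older spelling of the same idea):

tbls = pd.read_sql("SELECT name, type FROM sqlite_master", con)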


def readDB(con, table, startDate, endDate, DateField=None, fields=None):
Example #20
# <codecell>

starAlias = np.core.defchararray.replace(starAlias,'#','')

# <codecell>

starAliasUnique = []

maxLen = 0 
for i,thisStar in enumerate(np.unique(starAlias[:,0])):
    
    x = starAlias[np.where(thisStar==starAlias)[0],1]
    thisStarAliases = ", ".join(x)
    starAliasUnique.append([thisStar,thisStarAliases])
    
    #Get max len of alias column
    if len(thisStarAliases)>maxLen: maxLen =len(thisStarAliases)


print maxLen, 'is the length of the longest column'
starAliasUnique = np.array(starAliasUnique)

# <codecell>

labels = ['Name','Alias']
a = pd.DataFrame(starAliasUnique, dtype=str)
a.columns = labels
pd.set_printoptions(max_colwidth=maxLen)
print a.to_latex(index=False)
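The alias-joining loop above could also be written as a groupby aggregation (a hedged sketch, assuming starAlias is the two-column name/alias array used above):

aliases = pd.DataFrame(starAlias, columns=['Name', 'Alias'])
star_alias_unique = aliases.groupby('Name')['Alias'].agg(', '.join).reset_index()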

Example #21
import sys
from scipy.stats.mstats import f_oneway
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups  = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100)

results = []
for group in groups:
    grouped = data.groupby(group)
    ave = grouped.mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        F, prob = f_oneway(*grouped[number].values)
        improvement = ave[number].max() / data[number].mean() - 1
        if prob < .05:
            ave = ave[ave[number].notnull() & (ave['#'] > 10)]
            results.append([group, number, improvement, prob,
                ave.sort(number, ascending=False)])

for group, number, improvement, prob, ave in sorted(results, key=lambda v: v[2], reverse=True):
    print '\n%s by %s: %0.1f%% (%0.3f)' % (number, group, 100 * improvement, prob)
    # print '-' * 80
    # print ave.head(10)
    # if len(ave) > 20:
    #     print ave.tail(10)
Example #22
# IPython log file

get_ipython().magic(u'pinfo %logstart')
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
from pandas.tools.plotting import scatter_matrix
# need to manually set the print options in order to get data to display
pd.set_printoptions(max_columns=99)
# load the data using same command as last time
mrw_1992_data = sm.iolib.foreign.genfromdta('mrw1992.dta',
                                            missing_flt=np.nan,
                                            missing_str=np.nan,
                                            pandas=True)
print type(mrw_1992_data)
# view the entire dataset
mrw_1992_data
#[Out]#      c_index             c_name c_code  cont  nonoil  inter  oecd  gdp60  gdp85  popgrowth       igdp  school
#[Out]# 0          1            Algeria    DZA     1       1      1     0   2485   4371        2.6  24.100000     4.5
#[Out]# 1          2             Angola    AGO     1       1      0     0   1588   1171        2.1   5.800000     1.8
#[Out]# 2          3              Benin    BEN     1       1      0     0   1116   1071        2.4  10.800000     1.8
#[Out]# 3          4           Botswana    BWA     1       1      1     0    959   3671        3.2  28.299999     2.9
#[Out]# 4          5       Burkina Faso    BFA     1       1      0     0    529    857        0.9  12.700000     0.4
#[Out]# 5          6            Burundi    BDI     1       1      0     0    755    663        1.7   5.100000     0.4
#[Out]# 6          7           Cameroon    CMR     1       1      1     0    889   2190        2.1  12.800000     3.4
#[Out]# 7          8  Central Afr. Rep.    CAF     1       1      0     0    838    789        1.7  10.500000     1.4
#[Out]# 8          9               Chad    TCD     1       1      0     0    908    462        1.9   6.900000     0.4
#[Out]# 9         10           PR Congo    RCB     1       1      0     0   1009   2624        2.4  28.799999     3.8
#[Out]# 10        11              Egypt    EGY     1       1      0     0    907   2160        2.5  16.299999     7.0
Example #23
grouped.mean()

# <headingcell level=4>

# Time-Series

# <codecell>

dates = pd.date_range('1/1/2000', periods=50)
df = pd.DataFrame(np.random.randn(50, 4), index=dates, 
                  columns=['A', 'B', 'C', 'D'])

# <codecell>

pd.set_printoptions(max_rows=49)

# <codecell>

df

# <codecell>

df.head(5)

# <codecell>

df.ix["2000-1-5":"2000-1-15"]

# <codecell>
Example #24
# IPython log file

get_ipython().magic(u'pinfo %logstart')
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
from pandas.tools.plotting import scatter_matrix
# need to manually set the print options in order to get data to display
pd.set_printoptions(max_columns=99)
# load the data using same command as last time
mrw_1992_data = sm.iolib.foreign.genfromdta('mrw1992.dta', missing_flt=np.nan, missing_str=np.nan, pandas=True)
print type(mrw_1992_data)
# view the entire dataset
mrw_1992_data
#[Out]#      c_index             c_name c_code  cont  nonoil  inter  oecd  gdp60  gdp85  popgrowth       igdp  school
#[Out]# 0          1            Algeria    DZA     1       1      1     0   2485   4371        2.6  24.100000     4.5
#[Out]# 1          2             Angola    AGO     1       1      0     0   1588   1171        2.1   5.800000     1.8
#[Out]# 2          3              Benin    BEN     1       1      0     0   1116   1071        2.4  10.800000     1.8
#[Out]# 3          4           Botswana    BWA     1       1      1     0    959   3671        3.2  28.299999     2.9
#[Out]# 4          5       Burkina Faso    BFA     1       1      0     0    529    857        0.9  12.700000     0.4
#[Out]# 5          6            Burundi    BDI     1       1      0     0    755    663        1.7   5.100000     0.4
#[Out]# 6          7           Cameroon    CMR     1       1      1     0    889   2190        2.1  12.800000     3.4
#[Out]# 7          8  Central Afr. Rep.    CAF     1       1      0     0    838    789        1.7  10.500000     1.4
#[Out]# 8          9               Chad    TCD     1       1      0     0    908    462        1.9   6.900000     0.4
#[Out]# 9         10           PR Congo    RCB     1       1      0     0   1009   2624        2.4  28.799999     3.8
#[Out]# 10        11              Egypt    EGY     1       1      0     0    907   2160        2.5  16.299999     7.0
#[Out]# 11        12           Ethiopia    ETH     1       1      1     0    533    608        2.3   5.400000     1.1
#[Out]# 12        13              Gabon    GAB     1       0      0     0   1307   5350        1.4  22.100000     2.6
#[Out]# 13        14        Gambia, The    GMB     1       0      0     0    799    NaN        NaN  18.100000     1.5
Example #25
# <headingcell level=3>

# Tutorial Import Assumptions

# <codecell>

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats

np.set_printoptions(precision=4, suppress=True)
pandas.set_printoptions(notebook_repr_html=False,
                        precision=4,
                        max_columns=12)

# <headingcell level=3>

# Statsmodels Import Convention

# <codecell>

import statsmodels.api as sm

# <markdowncell>

# Import convention for models for which a formula is available.

# <codecell>
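The formula-based convention referred to above is usually written as follows (an assumption based on current statsmodels documentation, since the original cell is empty here):

import statsmodels.formula.api as smf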
Example #26
# GitHub link for the talk. You can clone the data and play with it yourself. Please submit any improvements as pull requests
# 
# [https://github.com/jseabold/538model](https://github.com/jseabold/538model)

# <codecell>

import datetime

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats
np.set_printoptions(precision=4, suppress=True)
pandas.set_printoptions(notebook_repr_html=False,
                        precision=4,
                        max_columns=12, column_space=10,
                        max_colwidth=25)

# <codecell>

today = datetime.datetime(2012, 10, 2)

# <headingcell level=2>

# Outline

# <markdowncell>

# Methodology was obtained from the old [538 Blog](http://www.fivethirtyeight.com/2008/03/frequently-asked-questions-last-revised.html) with updates at the [new site hosted by the New York Times](http://fivethirtyeight.blogs.nytimes.com/methodology/)

# <markdowncell>
Example #27
import sys
from scipy.stats.mstats import f_oneway
import pandas as pd

data = pd.read_csv(sys.argv[1])
numbers = data.dtypes[data.dtypes == float].index
groups  = data.dtypes[data.dtypes != float].index

pd.set_printoptions(max_columns=100)

for group in groups:
    grouped = data.groupby(group)
    ave = grouped.mean()
    ave['#'] = data[group].value_counts()
    for number in numbers:
        F, prob = f_oneway(*grouped[number].values)
        ave = ave[ave[number].notnull() & (ave['#'] > 10)]
        if len(ave):
            print '\n%s by %s: %0.3f' % (number, group, prob)
            print ave.sort(number, ascending=False).head(10)
Example #28
# <codecell>

e = pd.ExcelWriter("/data/adrian/Dropbox/pd_tf_enrichment_20130820.xls")

# <codecell>

e = open("/data/adrian/Dropbox/ptables/pd_tf_enrichment_20130822.html","w")

# <codecell>

out_path = "/data/adrian/Dropbox/ptables"

# <codecell>

pd.set_printoptions(max_colwidth=10000)

# <codecell>

top_tfs = chea_binding

# <codecell>

for k in top_tfs.keys():
    if top_tfs[k] is not None:
        print k
        t = top_tfs[k].sort_index(by="pval", ascending=True)[0:30].merge(tf_target_counts, left_on="tf_name", right_index=True)
        e.write("<hr>")
        e.write("<h2>" + k + "</h2>")
        
        m = top_motifs[k].merge(mm9_motif_gene_counts, left_on="motif_name", right_index=True).sort_index(by="pval", ascending=True)[0:30].drop(["motif_name_x", "motif_name_y"], axis=1)
Example #29
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import random
from credit_card_data import read_data

pd.set_printoptions(max_rows=50, max_columns=21)

df = read_data()
pos = df[df.anomaly]
neg = df[df.anomaly == False]

pmf_pos = pos.groupby('field4').size().apply(lambda x: float(x) / len(pos))
pmf_neg = neg.groupby('field4').size().apply(lambda x: float(x) / len(neg))
pmf_neg.plot(c='b'); pmf_pos.plot(c='r')
plt.show()
plt.clf()

def discrete_distribution(xk, pk):
	return stats.rv_discrete(name='hour1',values=(xk,pk))

xk_pos = tuple(pmf_pos.index)
pk_pos = tuple(pmf_pos.values)
rv_pos = discrete_distribution(xk_pos, pk_pos)

xk_neg = tuple(pmf_neg.index)
pk_neg = tuple(pmf_neg.values)
rv_neg = discrete_distribution(xk_neg, pk_neg)

# estimate the mean value of `field4` for anomalous transactions
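A hedged way to finish that last step, continuing from the discrete distributions built above (scipy's rv_discrete exposes a mean() method):

mean_field4_pos = rv_pos.mean()   # expected field4 value under the anomalous PMF
mean_field4_neg = rv_neg.mean()   # expected field4 value under the normal PMF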
Example #30
import pandas as pd
import os
from pandas import *
import numpy as np

# cleaning 2006 data
df2006=pd.read_csv('/home/yashu/Desktop/assignmentclasses/homework_04/2006.csv') # Change the path location
pd.set_printoptions(max_columns=841) # Optional: Testing to print all the columns. 
total_count=df2006.ix[:,0].count() # Counting total number of rows including missing
df2006=df2006.applymap(lambda x : np.nan if x == -999 else x) # applymap iterates through every cell in dataframe
required_col=[]# declare an empty list to store the required list
for col in df2006.columns:
    if float(float(df2006[col].count())/float(total_count))>=0.7: #the cut off is 30%, if the data has more than 30% missing the columns are chopped
        required_col.append(col)# Storing the required columns
df2006_processed=df2006[required_col]

# cleaning 2010 data
df2010=pd.read_csv('/home/yashu/Desktop/assignmentclasses/homework_04/2010.csv')
total_count_1=df2010.ix[:,0].count() # Counting total number of rows including missing
df2010=df2010.applymap(lambda x : np.nan if x == -999 else x) # applymap iterates through every cell in dataframe
required_col_1=[]# declare an empty list to store the required list
for col in df2010.columns:
    if float(float(df2010[col].count())/float(total_count_1))>=0.7: #the cut off is 30%, if the data has more than 30% missing the columns are chopped
       required_col_1.append(col)# Storing the required columns
df2010_processed=df2010[required_col_1]

#Reindexing the values
col1 = df2006_processed.columns.intersection(df2010_processed.columns)
df2006_clean = df2006_processed.reindex(columns=col1)
df2010_clean = df2010_processed.reindex(columns=col1)
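The -999 handling and the 30%-missing cutoff above can be written more compactly with replace and dropna (a rough sketch with approximately the same effect, not the original author's code):

df2006 = df2006.replace(-999, np.nan)
# keep only columns that are at least 70% non-missing
df2006_processed = df2006.dropna(axis=1, thresh=int(0.7 * len(df2006)))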
Example #31
# For a given entry in the index, identify which week it belongs to (multiple of
# 7).
which_week = lambda x: (x-data_df.index[0]).days / 7
weekly_group = data_df.groupby(which_week)
print "Weekly data:", weekly_group.mean()
print 

# 9. Plots
monthly_data = monthly_group.mean()
from matplotlib import pyplot
monthly_data.plot()
# Force this plot to happen in a separate figure
pyplot.figure()
monthly_data.boxplot()
pyplot.show()

# 10. This is just another way to group records:
unique_monthly_grouped = data_df.groupby(lambda d: (d.month, d.year))
print '10. Mean wind speed for each month in each location'
print unique_monthly_grouped.mean()
print 
  

# 11. Weekly stats over the first year
first_year = data_df.ix[:52*7,:]
weekly_first_year = first_year.groupby(which_week)
stats = weekly_first_year.apply(lambda x: x.describe())
import pandas
pandas.set_printoptions(max_rows=500, max_columns = 15, notebook_repr_html=False)
print stats
Example #32
# Use the source: https://github.com/statsmodels/statsmodels

# <headingcell level=3>

# Tutorial Import Assumptions

# <codecell>

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats

np.set_printoptions(precision=4, suppress=True)
pandas.set_printoptions(notebook_repr_html=False, precision=4, max_columns=12)

# <headingcell level=3>

# Statsmodels Import Convention

# <codecell>

import statsmodels.api as sm

# <markdowncell>

# Import convention for models for which a formula is available.

# <codecell>