def eval_classifer(self, individual):
    '''
    GA fitness function for feature selection: `individual` is a 0/1 mask over the
    feature columns; returns a one-element tuple with the mean cross-validation score.
    '''
    # self.counter += 1
    # if self.counter % 100 == 0: print self.counter / 4000
    import importHelper
    cv = importHelper.load('cvMachine')
    train = self.X
    cols_inx = [i for i in range(len(individual)) if individual[i] == 1]
    # keep only the columns whose bit is set in the individual (select by position)
    train = train.iloc[:, cols_inx]
    # print cols_inx
    # print train.head()
    # print len(train.columns)
    score = cv.sklearn_cross_validation(self.clf, train, self.Y).mean()
    # print score
    # print sum(individual)
    return score,
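# Hedged sketch of what the evaluator above computes, written outside the GA class:
# score a 0/1 feature mask by cross-validating a classifier on the selected columns
# only. The data, mask and classifier below are illustrative, not taken from this repo.
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def demo_eval_mask(X, y, individual, clf):
    cols_inx = [i for i in range(len(individual)) if individual[i] == 1]
    # mean CV accuracy of the classifier restricted to the selected feature columns
    return cross_val_score(clf, X[:, cols_inx], y, cv=5).mean()

# e.g. demo_eval_mask(X, y, [1, 0, 1, 1, 0], RandomForestClassifier())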
def lm_proc(df):
    lm = importHelper.load('lm')

    def clean_data_lm(df):
        def get_xy(df):
            return df.tweet, df.score
        df = clean_data(df)
        weekends = ev.Contain(['Fri', 'Sat', 'Sun', 'Mon'])
        # dfs[0]: tweets outside Fri-Mon, dfs[1]: tweets within Fri-Mon
        dfs = [df[df['weekday'] != weekends], df[df['weekday'] == weekends]]
        return [get_xy(df) for df in dfs]

    def load_lms(params):
        import itertools
        keys = params.keys()
        params_grid = itertools.product(*(params.values()))
        return [lm.lm(**dict(zip(keys, par))) for par in params_grid]

    dfs = clean_data_lm(df)
    params = {
        'a': [0.05, 0.1, 0.15, 0.2, 0.25],
        'smooth_method': ['jelinek_mercer', 'dirichlet']
    }
    # build a fresh grid of models for every subset so models are not shared between them
    models_on_dfs = [load_lms(params) for _ in dfs]
    # models = [m.fit(X, Y) for m, (X, Y) in zip(models, dfs)]
    scores = {}
    for models_on_df, (X, Y) in zip(models_on_dfs, dfs):
        for model in models_on_df:
            # get_params() returns a dict, which cannot be a dict key; use a sorted tuple instead
            key = tuple(sorted(model.get_params().items()))
            avg_score = cv.cross_validation(model, X, Y, avg=True)
            scores.setdefault(key, []).append(avg_score)
    return scores
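# Hedged illustration of how load_lms expands the params dict: itertools.product over
# the value lists yields one keyword dict per (a, smooth_method) combination.
def demo_params_grid():
    import itertools
    params = {'a': [0.05, 0.1], 'smooth_method': ['jelinek_mercer', 'dirichlet']}
    keys = list(params.keys())
    grid = [dict(zip(keys, par)) for par in itertools.product(*params.values())]
    # 2 values of a x 2 smoothing methods -> 4 model configurations
    return grid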
import matplotlib.pyplot as plt
import pandas as pd
import importHelper
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import re

pd.options.mode.chained_assignment = None

lm = importHelper.load('lm')
proc = importHelper.load('textpreprocessor')
cv = importHelper.load('cvMachine')

URL = './input/Tweets.csv'

def load_data(url=URL):
    return pd.read_csv(url)

def clean_tweet(s):
    '''
    :s : string; a tweet
    :return : list; words that don't contain a url or @somebody, in utf-8 and lower case
    '''
    extra_patterns = ['date', 'time', 'url', 'at']
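# A minimal, self-contained sketch of the kind of cleaning clean_tweet describes
# (drop URLs and @mentions, lower-case, tokenize). The regexes and the name
# demo_clean_tweet are illustrative assumptions, not this module's actual patterns.
def demo_clean_tweet(s):
    s = re.sub(r'https?://\S+', '', s)   # strip URLs
    s = re.sub(r'@\w+', '', s)           # strip @somebody mentions
    return [w.lower() for w in word_tokenize(s)]

# e.g. demo_clean_tweet("@united thanks! details at https://t.co/xyz")
# -> ['thanks', '!', 'details', 'at']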
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import importHelper
from scipy.stats import itemfreq
from sklearn.ensemble import RandomForestClassifier

cor = importHelper.load("cor")
cv = importHelper.load("cvMachine")

PATH = '/media/quan/Q/github/expedia-hotel-recommendations/data/'

def load_data(sqlContext, url):
    df = sqlContext.read.load(url,
                              format="com.databricks.spark.csv",
                              header='true',
                              inferSchema='true')
    return df

def sp_histogram(df, col, ax):
    counts = df.groupby(col).count().sort(col).toPandas()
    ax.bar(left=range(1, counts.shape[0] + 1),
           height=counts['count'],
           tick_label=counts[col])
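# Hedged usage sketch for load_data and sp_histogram: it assumes a local Spark 1.x
# setup with the spark-csv package on the classpath and a file named train.csv under
# PATH (both assumptions, not taken from this repo).
def demo_sp_histogram():
    conf = SparkConf().setAppName("expedia-eda").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    df = load_data(sqlContext, os.path.join(PATH, 'train.csv'))
    fig, ax = plt.subplots()
    sp_histogram(df, 'hotel_continent', ax)   # bar chart of row counts per hotel_continent
    plt.show()
    sc.stop()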
import numpy as np
from sklearn.datasets import load_digits, load_boston
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale
import importHelper
cv = importHelper.load('cvMachine')
import prep

def classification_test(clf, classes):
    X, y = get_classification_data(classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # print y_pred
    # print y_test
    # accuracy on the held-out split
    print sum([1 for i in range(y_test.shape[0]) if y_pred[i] == y_test[i]]) / float(y_test.shape[0])
    print cv.confusion_matrix_from_cv(clf, X, y, cv=5)

def regression_test(clf):
    X, y = get_regression_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)
    clf.fit(X_train, y_train)
    # residual sum of squares on the held-out split
    print np.sum((clf.predict(X_test) - y_test) ** 2)

def get_classification_data(classes):
import pandas as pd
import importHelper
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from collections import Counter
import ga
import wrapper
import math

trainURL = '../train.csv'
testURL = '../test.csv'

cv = importHelper.load('cvMachine')
proc = importHelper.load('preprocessor')

def load_data(url):
    df = pd.read_csv(url, index_col=0)
    return df

def clean_data(df, test=False):
    # remove features with almost zero variance, plus 'Wilderness_Area4' and 'Soil_Type40' in case of multicollinearity
    # remove_list = proc.small_var(df, 0.002)
    # remove_list.extend(['Wilderness_Area3', 'Soil_Type10', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
    remove_list = ['Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type15', 'Soil_Type21',
                   'Soil_Type25', 'Soil_Type27', 'Soil_Type28', 'Soil_Type34', 'Soil_Type36',
                   'Wilderness_Area3', 'Soil_Type10', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    df = df.drop(labels=remove_list, axis=1)
    # df = df.astype(np.float32)
    if not test:
        df = proc.shuffle(df)
    return df
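# Hedged sketch of the near-zero-variance screen referred to in the comment inside
# clean_data: proc.small_var is this repo's own helper, so the plain-pandas version
# below is only an assumption about what it does (flag columns whose variance is
# below the threshold).
def demo_small_var(df, thr=0.002):
    variances = df.var()
    return list(variances[variances < thr].index)

# e.g. remove_list = demo_small_var(load_data(trainURL), 0.002)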
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import importHelper

cv = importHelper.load("cvMachine")

def main():
    df1 = pd.read_csv('./data/train1_booking.csv', index_col=0)
    # df2 = pd.read_csv('./data/train1_unbooking.csv', index_col=0)
    X1, y1 = df1[[e for e in df1.columns if e != 'hotel_cluster']], df1['hotel_cluster']
    # X2, y2 = df2[[e for e in df2.columns if e != 'hotel_cluster']], df2['hotel_cluster']
    rf = RandomForestClassifier()
    print cv.sklearn_cross_validation(rf, X1, y1)
    # print cv.sklearn_cross_validation(rf, X2, y2)

if __name__ == '__main__':
    main()
import pandas as pd
import numpy as np
import importHelper
proc = importHelper.load('preprocessor')
from sklearn.linear_model import LogisticRegression

TESTDATAURL = "test.csv"
TRAINDATAURL = "train.csv"

use_cols = ["Dates", "Category", "PdDistrict", "X", "Y", "Address"]
use_cols2 = ["Id", "Dates", "PdDistrict", "X", "Y", "Address"]

def load_data(test=False):
    if test:
        df = pd.read_csv(TESTDATAURL, usecols=use_cols2, parse_dates=["Dates"], index_col='Id')
    else:
        df = pd.read_csv(TRAINDATAURL, usecols=use_cols, parse_dates=["Dates"])
    return df

class sf_preprocessor(proc.preprocessor):
    """
    Process data into the right format for classification. Outliers are deleted or
    averaged; features are vectorized; several features are extracted from the dates;
    the address is cleaned to determine whether a block is in it.
    """
    def __init__(self, df, test=False, y_name=None):
        super(sf_preprocessor, self).__init__(df, test, y_name)
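# Hedged sketch of the date and address features the docstring above describes; the
# exact features sf_preprocessor derives are not shown here, so the columns below
# (hour, weekday, has_block) are illustrative assumptions.
def demo_date_address_features(df):
    df['hour'] = df['Dates'].dt.hour
    df['weekday'] = df['Dates'].dt.dayofweek
    # the docstring says the address is cleaned to flag whether a block is in it
    df['has_block'] = df['Address'].str.contains('Block').astype(int)
    return df

# e.g. demo_date_address_features(load_data())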
import pandas as pd
import importHelper
import loader
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
proc = importHelper.load('preprocessor')
# cross_validation = importHelper.load('cvMachine')

class tfidf_cook_processer(proc.preprocessor):
    """
    Process data into the right format for classification. tf*idf is applied to
    transform the data and stop words are considered. Items in each recipe are
    'cleaned' by extracting the last word of each item; for example, "soy milk"
    becomes "milk" after processing, and this step can be switched on or off with a
    tiny change. The class inherits from preprocessor in preprocessor.py, see
    yq911122/module on GitHub.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    cv = CountVectorizer()
    tfidf = TfidfTransformer()
    stop_words = []

    def __init__(self, df, test=False, y_name='cuisine'):
        super(tfidf_cook_processer, self).__init__(df, test, y_name)
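# Hedged sketch of the transformation the docstring describes: keep the last word of
# each ingredient ("soy milk" -> "milk"), join each recipe into one document, then
# apply CountVectorizer + TfidfTransformer. demo_tfidf_recipes is illustrative, not
# the class's actual implementation.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def demo_tfidf_recipes(recipes):
    # recipes: list of lists of ingredient strings
    docs = [' '.join(item.split()[-1] for item in recipe) for recipe in recipes]
    counts = CountVectorizer().fit_transform(docs)
    return TfidfTransformer().fit_transform(counts)

# e.g. demo_tfidf_recipes([["soy milk", "plain flour"], ["garlic", "olive oil"]])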
import pandas as pd
import numpy as np
from math import log
import importHelper
entropy = importHelper.load("entropy")
from entropy import ent, prob

def slice_ent(s):
    counts = np.bincount(s)
    vals = np.true_divide(counts, s.shape[0])
    return ent(vals), np.sum(vals != 0)

# @profile
def ent_eval(s, t):
    """
    s: data set, pd.Series of a column as label
    t: cut point, int as index
    return: entropy of the split, weighted by partition size, used to evaluate the cut point
    """
    size = float(s.shape[0])
    assert 0 <= t < size, "invalid cut point"
    s1, s2 = s[:t], s[t+1:]
    return (s1.shape[0] * slice_ent(s1)[0] + s2.shape[0] * slice_ent(s2)[0]) / size
    # return s1.shape[0]/size*ent(s1) + s2.shape[0]/size*ent(s2)
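# Hedged usage sketch: scan candidate cut points of a small label series and report the
# weighted split entropy from ent_eval. It assumes entropy.ent takes a vector of class
# probabilities and returns the Shannon entropy, as slice_ent suggests.
def demo_cut_scan():
    labels = pd.Series([0, 0, 0, 0, 1, 1, 1, 1])
    return [(t, ent_eval(labels, t)) for t in range(1, labels.shape[0] - 1)]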
import pandas as pd
import numpy as np
import importHelper
dt = importHelper.load("discretizer")
fs = importHelper.load("featureSelector")
entropy = importHelper.load("entropy")
proc = importHelper.load("preprocessor")

def symmetricalUncertainty(x, y):
    return 2 * entropy.infoGain(x, y) / (entropy.ent(x) + entropy.ent(y))

def process(df, discrete_cols, continus_cols, y_col):
    x_cols = discrete_cols + continus_cols
    cuts = dt.EntropyDiscretize(df[continus_cols].values, df[y_col].astype(np.int32).values)
    print cuts
    # df[continus_cols] = proc.discretize_df(df, continus_cols, cut)
    # fs = fs.corrSelector(df[x_cols], df[y_col], symmetricalUncertainty, 0.0)
    # print fs.process()

def main():
    df1 = pd.read_csv('./data/train1_booking.csv', index_col=0)
    df2 = pd.read_csv('./data/train1_unbooking.csv', index_col=0)

    continus_cols2 = [u'orig_destination_distance', u'cnt']
    discrete_cols2 = [u'site_name', u'posa_continent', u'user_location_country',
                      u'user_location_region', u'user_location_city', u'user_id',
                      u'is_mobile', u'is_package', u'channel', u'srch_destination_id',
                      u'srch_destination_type_id', u'srch_adults_cnt', u'srch_children_cnt',
                      u'srch_rm_cnt', u'hotel_continent', u'hotel_country', u'hotel_market',
                      u'plan_hour']
    continus_extra, discrete_extra = [u'plan_days', u'travel_days'], [u'travel_month']
    continus_cols1 = continus_cols2 + continus_extra
    discrete_cols1 = discrete_cols2 + discrete_extra
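# Hedged self-contained usage sketch for symmetricalUncertainty above
# (SU = 2*infoGain(x, y) / (ent(x) + ent(y))); it assumes entropy.ent and
# entropy.infoGain implement Shannon entropy and information gain on discrete columns.
def demo_symmetrical_uncertainty():
    x = pd.Series([0, 0, 1, 1, 1, 0])
    y = pd.Series([0, 0, 1, 1, 0, 0])
    # identical columns would score 1.0; independent columns score close to 0.0
    return symmetricalUncertainty(x, y)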
import importHelper
import pandas as pd
import numpy as np
import string
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import re

ev = importHelper.load('everything')
cv = importHelper.load('cvMachine')
proc = importHelper.load('textpreprocessor')

URL = 'traindata.csv'

def load_data(url=URL):
    df = pd.read_csv(url, header=None)
    df.columns = ['score', 'usrid', 'timestamp', 'no_query', 'usr', 'tweet']
    return df[['score', 'timestamp', 'tweet']].sample(frac=0.001)

def clean_tweet(s):
    '''
    :s : string; a tweet
    :return : list; words that don't contain a url or @somebody, in utf-8 and lower case
    '''
    extra_patterns = ['date', 'time', 'url', 'at']
    # pattern_at = re.compile(r'@\w+')
    # pattern_url = re.compile(r'^https?:\/\/.*[\r\n]*')
import pandas as pd
import numpy as np
import importHelper
import loader
lm = importHelper.load('lm')

class lm_cook_processer():
    """
    Process data into the right format for classification. Stop words are considered.
    Items in each recipe are 'cleaned' by extracting the last word of each item; for
    example, "soy milk" becomes "milk" after processing, and this step can be switched
    on or off with a tiny change.
    """
    stop_words = []

    def __init__(self, df, test=False):
        self.df = df
        self.test = test

    def get_stop_words(self, s, n=8):
        '''
        (u'salt', 23743), (u'pepper', 23569), (u'oil', 22824), (u'sauce', 12822),
        (u'onions', 12341), (u'sugar', 12340), (u'cheese', 10837), (u'water', 9572),
        (u'garlic', 9555)
        '''
        from collections import Counter
        l = []
        s.map(lambda x: l.extend(x))
        return [x[0] for x in Counter(l).most_common(n)]

    def union_list(self):
        self.df.loc[:, 'ingredients'] = self.df.loc[:, 'ingredients'].map(self._wrap_last_word)
        if not self.test:
            lm_cook_processer.stop_words = self.get_stop_words(self.df['ingredients'])
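# Hedged usage sketch for get_stop_words: from a Series of ingredient lists, take the
# n most frequent ingredients as stop words (the toy recipes below are made up).
def demo_stop_words():
    recipes = pd.Series([['salt', 'pepper', 'milk'],
                         ['salt', 'oil'],
                         ['salt', 'pepper', 'garlic']])
    p = lm_cook_processer(pd.DataFrame({'ingredients': recipes}))
    return p.get_stop_words(recipes, n=2)   # -> ['salt', 'pepper']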
import pandas as pd
import numpy as np
import importHelper
ent = importHelper.load("entropy")

def load_data():
    return pd.read_csv('./data/train1.csv', index_col=0, parse_dates=[1, 12, 13])

def people_type(people):
    """
    :people : (child count, adult count)
    :return : int;
        1: single adult: child = 0, adult = 1
        2: couples:      child = 0, adult = 2
        3: families:     child > 0, adult > 0
        4: friends:      child = 0, adult > 2
        5: others
    """
    child, adult = people[0], people[1]
    if child == 0 and adult == 1:
        return 1
    if child == 0 and adult == 2:
        return 2
    if child > 0 and adult > 0:
        return 3
    if child == 0 and adult > 2:
        return 4
    return 5

def trip_type(days):
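# Hedged usage sketch: map each booking row to a party type using the Expedia columns
# srch_children_cnt and srch_adults_cnt (these column names appear elsewhere in this
# repo; whether train1.csv still carries them here is an assumption).
def demo_people_type(df):
    return df.apply(lambda r: people_type((r['srch_children_cnt'], r['srch_adults_cnt'])), axis=1)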
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import importHelper
from sklearn.ensemble import RandomForestClassifier

cv = importHelper.load('cvMachine')
proc = importHelper.load('preprocessor')

trainUrl = './data/train.csv'
testUrl = './data/test.csv'
nafrac = 0.6
corthr = 0.6

def load_data(url):
    return pd.read_csv(url, index_col=0)

def get_correlated_cols(df, thr):
    cor = np.corrcoef(df.T)
    np.fill_diagonal(cor, 0)
    cor_indice = np.where(cor > thr)
    cor_indice = [(i, j) for i, j in zip(cor_indice[0], cor_indice[1])]
    cor_cols = []
    seen = set()
    for (i, j) in cor_indice:
        # np.where reports both (i, j) and (j, i); keep each correlated pair only once
        if (j, i) not in seen:
            seen.add((i, j))
            cor_cols.append((df.columns[i], df.columns[j]))
    return cor_cols
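# Hedged usage sketch: drop the second member of every highly correlated pair reported
# by get_correlated_cols; keeping the first member of each pair is an arbitrary choice.
def demo_drop_correlated(df, thr=corthr):
    to_drop = set(j for _, j in get_correlated_cols(df, thr))
    return df.drop(labels=list(to_drop), axis=1)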
# Statistical language model. Dirichlet and Jelinek-Mercer discount methods are applied.
import importHelper
import pandas as pd
import random

everything = importHelper.load('everything')
static_vars = everything.static_vars

def jelinek_mercer(ct, p_ml, p_ref, a=0.1):
    from math import log
    log_p_s = (p_ml * (1 - a) + p_ref.loc[p_ml.index] * a).map(log)
    return log_p_s

def dirichlet(ct, p_ml, p_ref, a=0.1):
    from math import log
    d = len(p_ml)
    u = a / (1 + a) * d
    log_p_s = ((ct + u * p_ref.loc[ct.index]) / (d + u)).map(log)
    return log_p_s

def lm(df, x_name, y_name, a=0.1, smooth_method=jelinek_mercer):
    '''
    df: DataFrame containing features and category. Features are actually a list of
        words, standing for the document.
    x_name: column name of the features
    y_name: column name of the category
    a: discount parameter; should be tuned via cross-validation
    smooth_method: method selected to discount the probabilities

    out: language model
    '''
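# Hedged worked example for the two smoothing functions above on a toy vocabulary.
# It assumes ct holds raw word counts of one document, p_ml its maximum-likelihood
# word distribution and p_ref the reference (collection) distribution, all as pandas
# Series indexed by word; these assumptions follow from how the functions index p_ref.
def demo_smoothing():
    ct = pd.Series({'good': 3, 'flight': 1})    # word counts in the document
    p_ml = ct / ct.sum()                        # maximum-likelihood estimates
    p_ref = pd.Series({'good': 0.02, 'flight': 0.01, 'late': 0.005})
    jm = jelinek_mercer(ct, p_ml, p_ref, a=0.1)     # log((1 - a) * p_ml + a * p_ref)
    dir_ = dirichlet(ct, p_ml, p_ref, a=0.1)
    return jm, dir_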