Example #1
	def eval_classifer(self, individual):
		# self.counter += 1
		# if self.counter % 100 == 0: print self.counter / 4000

		import importHelper
		cv = importHelper.load('cvMachine')

		train = self.X
		# keep only the columns whose bit in the binary individual is set to 1
		cols_inx = [i for i in range(len(individual)) if individual[i] == 1]
		train = train[cols_inx]
		# print cols_inx
		# print train.head()

		# print len(train.columns)
		score = cv.sklearn_cross_validation(self.clf, train, self.Y).mean()
		# print score
		# print sum(individual)
		return score,
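The trailing comma makes eval_classifer return a one-element tuple, the fitness format expected by genetic-algorithm frameworks such as DEAP, with the individual acting as a binary mask over the feature columns. Purely as an illustration (the surrounding GA code is not shown in this excerpt), such an evaluator could be wired into a DEAP toolbox roughly like this; N_FEATURES and the stand-in evaluator are hypothetical:

# Hypothetical wiring in the style of DEAP's GA toolbox; the evaluator below
# is a stand-in for eval_classifer and N_FEATURES is an assumed column count.
import random
from deap import base, creator, tools

N_FEATURES = 20

def evaluate(individual):
    # stand-in fitness: must be a tuple, hence the trailing comma
    return float(sum(individual)),

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)   # one bit per column
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_bool, n=N_FEATURES)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)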
Example #2
def lm_proc(df):
    lm = importHelper.load('lm')

    def clean_data_lm(df):
        def get_xy(df):
            return df.tweet, df.score

        df = clean_data(df)
        weekends = ev.Contain(['Fri', 'Sat', 'Sun', 'Mon'])
        dfs = [df[df['weekday'] != weekends], df[df['weekday'] == weekends]]

        return [get_xy(df) for df in dfs]

    def load_lms(params):
        import itertools
        keys = params.keys()
        params_grid = itertools.product(*(params.values()))
        return [lm.lm(**dict(zip(keys, par))) for par in params_grid]

    dfs = clean_data_lm(df)

    params = {
        'a': [0.05, 0.1, 0.15, 0.2, 0.25],
        'smooth_method': ['jelinek_mercer', 'dirichlet']
    }

    models_on_dfs = [load_lms(params)] * len(dfs)
    # models = [m.fit(X, Y) for m, (X, Y) in zip(models, dfs)]

    scores = {}
    for models_on_df, (X, Y) in zip(models_on_dfs, dfs):
        for model in models_on_df:
            params = model.get_params()
            # print params
            avg_score = cv.cross_validation(model, X, Y, avg=True)
            if params not in scores:
                scores[params] = [avg_score]
            else:
                scores[params].append(avg_score)
    return scores
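One caveat in the scoring loop above: if get_params() returns a plain dict (as scikit-learn estimators do), it cannot be used as a dictionary key. Assuming a dict return value, a minimal workaround is to key the scores by the sorted parameter items instead:

# Hypothetical adjustment: derive a hashable key from a parameter dict before
# using it to index the scores dictionary.
params = {'a': 0.1, 'smooth_method': 'dirichlet'}   # e.g. model.get_params()
key = tuple(sorted(params.items()))

scores = {}
scores.setdefault(key, []).append(0.73)             # e.g. avg_score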
Example #4
import matplotlib.pyplot as plt

import pandas as pd

import importHelper

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import re

pd.options.mode.chained_assignment = None

lm = importHelper.load('lm')
proc = importHelper.load('textpreprocessor')
cv = importHelper.load('cvMachine')

URL = './input/Tweets.csv'	
def load_data(url=URL):
	return pd.read_csv(url)

def clean_tweet(s):
	'''
	:param s: string; a tweet

	:return: list; lower-cased UTF-8 words with URLs and @mentions removed
	'''
	extra_patterns=['date','time','url','at']
Example #5
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

import os
import warnings

import importHelper

from scipy.stats import itemfreq
from sklearn.ensemble import RandomForestClassifier

cor = importHelper.load("cor")
cv = importHelper.load("cvMachine")

PATH = '/media/quan/Q/github/expedia-hotel-recommendations/data/'

def load_data(sqlContext,url):
	df = sqlContext.read.load(url,
							format="com.databricks.spark.csv",
							header='true',
							inferSchema='true')
	return df

def sp_histogram(df, col, ax):
	counts = df.groupby(col).count().sort(col).toPandas()
	ax.bar(left=range(1,counts.shape[0]+1), height=counts['count'], tick_label=counts[col])
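Two version notes on this snippet: the com.databricks.spark.csv package was merged into Spark itself in the 2.x line, and newer matplotlib releases use x in place of bar()'s old left keyword. Rough modern equivalents, assuming a SparkSession spark and a matplotlib Axes ax are already available:

# Sketch of modern equivalents; `spark` (a SparkSession) and `ax` (a matplotlib
# Axes) are assumed to exist.
def load_data_v2(spark, url):
    # Spark 2.x+ ships a built-in CSV reader, replacing com.databricks.spark.csv
    return spark.read.csv(url, header=True, inferSchema=True)

def sp_histogram_v2(df, col, ax):
    counts = df.groupby(col).count().sort(col).toPandas()
    # matplotlib >= 3.0 uses `x` rather than the old `left` keyword
    ax.bar(x=range(1, counts.shape[0] + 1), height=counts['count'],
           tick_label=counts[col])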
Example #6
import numpy as np
from sklearn.datasets import load_digits, load_boston
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale

import importHelper
cv = importHelper.load('cvMachine')

import prep

def classification_test(clf, classes):
	X, y = get_classification_data(classes)
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

	clf.fit(X_train, y_train)
	y_pred = clf.predict(X_test)
	# print y_pred
	# print y_test
	print sum([1 for i in range(y_test.shape[0]) if y_pred[i] == y_test[i]]) / float(y_test.shape[0])
	print cv.confusion_matrix_from_cv(clf, X, y, cv=5)


def regression_test(clf):
	X, y = get_regression_data()
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)

	clf.fit(X_train, y_train)
	print np.sum((clf.predict(X_test)-y_test)**2)


def get_classification_data(classes):
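Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; on current versions the equivalent import for this snippet is:

# Modern import path (scikit-learn >= 0.18) for the train_test_split used above.
from sklearn.model_selection import train_test_split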
Example #7
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

from collections import Counter

import ga
import wrapper

import math

import pandas as pd
import importHelper

trainURL = '../train.csv'
testURL = '../test.csv'

cv = importHelper.load('cvMachine')
proc = importHelper.load('preprocessor')

def load_data(url):
	df = pd.read_csv(url, index_col=0)
	return df

def clean_data(df, test=False):
	# remove features with almost zero variance, plus 'Wilderness_Area4' and 'Soil_Type40', to guard against multicollinearity
	# remove_list = proc.small_var(df,0.002)
	# remove_list.extend(['Wilderness_Area3', 'Soil_Type10','Hillshade_9am','Hillshade_Noon','Hillshade_3pm'])
	remove_list = ['Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type15',
				'Soil_Type21', 'Soil_Type25', 'Soil_Type27', 'Soil_Type28',
				'Soil_Type34', 'Soil_Type36', 'Wilderness_Area3', 'Soil_Type10',
				'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
	df = df.drop(labels = remove_list, axis=1)
	# df = df.astype(np.float32)
	if not test: df = proc.shuffle(df)
	return df
Example #8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import importHelper
cv = importHelper.load("cvMachine")


def main():
    df1 = pd.read_csv('./data/train1_booking.csv', index_col=0)
    # df2 = pd.read_csv('./data/train1_unbooking.csv', index_col=0)

    X1, y1 = df1[[e for e in df1.columns
                  if e != 'hotel_cluster']], df1['hotel_cluster']
    # X2, y2 = df2[[e for e in df2.columns if e != 'hotel_cluster']], df2['hotel_cluster']
    rf = RandomForestClassifier()
    print cv.sklearn_cross_validation(rf, X1, y1)

    # print cv.sklearn_cross_validation(rf, X2, y2)


if __name__ == '__main__':
    main()
Example #9
import pandas as pd
import numpy as np
import importHelper
proc = importHelper.load('preprocessor')

from sklearn.linear_model import LogisticRegression

TESTDATAURL = "test.csv"
TRAINDATAURL = "train.csv"
use_cols = ["Dates","Category","PdDistrict","X","Y","Address"]
use_cols2 = ["Id","Dates","PdDistrict","X","Y","Address"]

def load_data(test = False):
	if test:
		df = pd.read_csv(TESTDATAURL,
					usecols = use_cols2,
					parse_dates=["Dates"],
					index_col = 'Id')
	else:
		df = pd.read_csv(TRAINDATAURL,
				usecols=use_cols, 
				parse_dates=["Dates"])
	return df

class sf_preprocessor(proc.preprocessor):
	"""
	process data into right format for classification. Outliers are deleted/averaged; features are vectorized; several features are extracted from dates; address is cleaned to determine whether block is in it. 
	"""
	
	def __init__(self, df, test=False, y_name=None):
		super(sf_preprocessor, self).__init__(df, test, y_name)
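As a rough, hypothetical sketch of the date and address features the docstring mentions (the actual logic lives in the parent preprocessor class; the 'Dates' and 'Address' columns come from the use_cols lists above):

# Hypothetical sketch of the date/address features described in the docstring;
# not the repository's actual implementation.
import pandas as pd

def add_basic_features(df):
    df['hour'] = df['Dates'].dt.hour                  # hour of the incident
    df['weekday'] = df['Dates'].dt.weekday            # 0 = Monday
    df['has_block'] = (df['Address']
                       .str.contains('Block', case=False, na=False)
                       .astype(int))                  # address mentions a block?
    return df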
Example #10
import pandas as pd
import importHelper

import loader

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

proc = importHelper.load('preprocessor')
# cross_validation = importHelper.load('cvMachine')


class tfidf_cook_processer(proc.preprocessor):
	"""
	process data into right format for classification. tf*idf is applied to transform the data and stop words are considered. 
	items in each recipe is 'cleaned' by extracting the last word of each item. For example, "soy milk" will be "milk" after processing, which can be chosen whether to applied or not by tiny change. 

	the class is inherited by preprocessor in preprocessor.py, see yq911122/module on Github.

	"""

	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer
	cv = CountVectorizer()
	tfidf = TfidfTransformer()
	stop_words = []

	def __init__(self, df, test=False, y_name = 'cuisine'):
		super(tfidf_cook_processer, self).__init__(df,test,y_name)
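A small, self-contained illustration (not taken from the repository) of the transformation the docstring describes: each ingredient is reduced to its last word, then the joined recipes are vectorized with CountVectorizer and TfidfTransformer:

# Toy illustration of the docstring's "soy milk" -> "milk" cleaning followed by
# count vectorization and tf-idf weighting; not the repository's actual code.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

recipes = [['soy milk', 'white sugar'], ['garlic cloves', 'olive oil']]
docs = [' '.join(item.split()[-1] for item in recipe) for recipe in recipes]
# docs == ['milk sugar', 'cloves oil']

counts = CountVectorizer().fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)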
		
Example #11
# Statistical language model. Dirichlet and Jelinek-Mercer discounting methods are applied.

import importHelper
import pandas as pd
import random

everything = importHelper.load('everything')
static_vars = everything.static_vars

def jelinek_mercer(ct, p_ml,p_ref,a=0.1):
	from math import log
	log_p_s = (p_ml*(1-a)+p_ref.loc[p_ml.index]*a).map(log)
	return log_p_s

def dirichlet(ct, p_ml,p_ref,a=0.1):
	from math import log
	d = len(p_ml)
	u = a / (1+a)*d
	log_p_s = ((ct+u*p_ref.loc[ct.index])/(d+u)).map(log)
	return log_p_s

def fit(df, x_name, y_name, a=0.1, smooth_method=jelinek_mercer):
	'''
	df: DataFrame containing features and category. The features are a list of words representing the document.
	x_name: column name of features
	y_name: column name of the category
	a: discount parameter; should be tuned via cross validation
	smooth_method: method selected to discount the probabilities

	out: language model
	'''
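For reference, the two discounting functions above follow the standard smoothing formulas from statistical language modelling (Jelinek-Mercer interpolation and Dirichlet priors, as in Zhai & Lafferty); a toy illustration, independent of the repository's data:

# Toy illustration (not from the original repo) of the standard formulas:
#   Jelinek-Mercer: p(w|d) = (1 - a) * p_ml(w|d) + a * p_ref(w)
#   Dirichlet:      p(w|d) = (c(w,d) + u * p_ref(w)) / (|d| + u)
import numpy as np
import pandas as pd

ct = pd.Series({'good': 3, 'bad': 1})            # word counts in the document
p_ml = ct / ct.sum()                             # maximum-likelihood estimate
p_ref = pd.Series({'good': 0.4, 'bad': 0.6})     # reference (collection) model

a, u = 0.1, 2000.0
jm_log_p = np.log((1 - a) * p_ml + a * p_ref.loc[p_ml.index])
dirichlet_log_p = np.log((ct + u * p_ref.loc[ct.index]) / (ct.sum() + u))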
Example #12
import pandas as pd
import numpy as np

from math import log

import importHelper

entropy = importHelper.load("entropy")
from entropy import ent, prob

def slice_ent(s):
	counts = np.bincount(s)
	vals = np.true_divide(counts, s.shape[0])
	return ent(vals), np.sum(vals != 0)

# @profile
def ent_eval(s, t):
	"""
	s: data set; pd.Series of a label column
	t: cut point; int index

	return: evaluation value (size-weighted entropy of the two slices)
	"""
	size = float(s.shape[0])
	assert(0 <= t < size),"invalid cut point"
	s1, s2 = s[:t], s[t+1:]

	return (s1.shape[0]*slice_ent(s1)[0] + s2.shape[0]*slice_ent(s2)[0])/size
	# return s1.shape[0]/size*ent(s1)+s2.shape[0]/size*ent(s2)

Example #13
import pandas as pd
import numpy as np

import importHelper

dt = importHelper.load("discretizer")
fs = importHelper.load("featureSelector")
entropy = importHelper.load("entropy")
proc = importHelper.load("preprocessor")

def symmetricalUncertainty(x, y):
	return 2*entropy.infoGain(x, y)/(entropy.ent(x)+entropy.ent(y))

def process(df, discrete_cols, continus_cols, y_col):
	x_cols = discrete_cols + continus_cols
	cuts = dt.EntropyDiscretize(df[continus_cols].values, df[y_col].astype(np.int32).values)
	print cuts
	# df[continus_cols] = proc.discretize_df(df, continus_cols, cut)
	# fs = fs.corrSelector(df[x_cols], df[y_col], symmetricalUncertainty, 0.0)
	# print fs.process()


def main():
	df1 = pd.read_csv('./data/train1_booking.csv', index_col=0)
	df2 = pd.read_csv('./data/train1_unbooking.csv', index_col=0)
	continus_cols2 = [u'orig_destination_distance', u'cnt']
	discrete_cols2 = [u'site_name', u'posa_continent', u'user_location_country',
					u'user_location_region', u'user_location_city', u'user_id',
					u'is_mobile', u'is_package', u'channel', u'srch_destination_id',
					u'srch_destination_type_id', u'srch_adults_cnt',
					u'srch_children_cnt', u'srch_rm_cnt', u'hotel_continent',
					u'hotel_country', u'hotel_market', u'plan_hour']

	continus_extra, discrete_extra = [u'plan_days', u'travel_days'], [u'travel_month']
	continus_cols1 = continus_cols2 + continus_extra
	discrete_cols1 = discrete_cols2 + discrete_extra
Example #14
import importHelper
import pandas as pd
import numpy as np

import string

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import re

ev = importHelper.load('everything')
cv = importHelper.load('cvMachine')
proc = importHelper.load('textpreprocessor')

URL = 'traindata.csv'


def load_data(url=URL):
    df = pd.read_csv(url, header=None)
    df.columns = ['score', 'usrid', 'timestamp', 'no_query', 'usr', 'tweet']
    return df[['score', 'timestamp', 'tweet']].sample(frac=0.001)


def clean_tweet(s):
    '''
    :param s: string; a tweet

    :return: list; lower-cased UTF-8 words with URLs and @mentions removed
    '''
    extra_patterns = ['date', 'time', 'url', 'at']
Example #15
import pandas as pd
import numpy as np

import importHelper

import loader

lm = importHelper.load('lm')

class lm_cook_processer():
	"""
	process data into right format for classification. stop words are considered. items in each recipe is 'cleaned' by extracting the last word of each item. For example, "soy milk" will be "milk" after processing, which can be chosen whether to applied or not by tiny change. 
	"""
	stop_words = []
	
	def __init__(self, df, test=False):
		self.df = df
		self.test = test

	def get_stop_words(self, s, n=8):
		'''
		Return the n most common ingredients across all recipes. Example counts:
		(u'salt', 23743), (u'pepper', 23569), (u'oil', 22824), (u'sauce', 12822), (u'onions', 12341), (u'sugar', 12340), (u'cheese', 10837), (u'water', 9572), (u'garlic', 9555)
		'''
		from collections import Counter
		l = []
		s.map(lambda x: l.extend(x))
		return [x[0] for x in Counter(l).most_common(n)]

	def union_list(self):
		self.df.loc[:,'ingredients'] = self.df.loc[:,'ingredients'].map(self._wrap_last_word)
		if not self.test: lm_cook_processer.stop_words = self.get_stop_words(self.df['ingredients'])
Example #16
import importHelper
import pandas as pd
import numpy as np

import string

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import re

ev = importHelper.load('everything')
cv = importHelper.load('cvMachine')
proc = importHelper.load('textpreprocessor')

URL = 'traindata.csv'

def load_data(url=URL):
	df = pd.read_csv(url, header=None)
	df.columns = ['score','usrid','timestamp','no_query', 'usr', 'tweet']
	return df[['score','timestamp','tweet']].sample(frac=0.001)

def clean_tweet(s):
	'''
	:param s: string; a tweet

	:return: list; lower-cased UTF-8 words with URLs and @mentions removed
	'''
	extra_patterns=['date','time','url','at']
	# pattern_at = re.compile(r'@\w+')
	# pattern_url = re.compile(r'^https?:\/\/.*[\r\n]*')
Example #18
import pandas as pd
import numpy as np

import importHelper

ent = importHelper.load("entropy")


def load_data():
    return pd.read_csv('./data/train1.csv',
                       index_col=0,
                       parse_dates=[1, 12, 13])


def people_type(people):
    """
	:rtype: 1: single adult: child = 0, adult = 1
			2: couples: child = 0, adult = 2
			3: families: child > 0, adult > 0
			4: friends: child = 0, adult > 2
			5: others
	"""
    child, adult = people[0], people[1]
    if child == 0 and adult == 1: return 1
    if child == 0 and adult == 2: return 2
    if child > 0 and adult > 0: return 3
    if child == 0 and adult > 2: return 4
    return 5
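A hypothetical way to apply people_type, assuming the train1.csv columns include srch_children_cnt and srch_adults_cnt as in the other Expedia examples above:

# Hypothetical usage; the column names are assumptions carried over from the
# other Expedia snippets, not from this excerpt.
df = load_data()
df['people_type'] = df.apply(
    lambda r: people_type((r['srch_children_cnt'], r['srch_adults_cnt'])),
    axis=1)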


def trip_type(days):
Example #19
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import importHelper

from sklearn.ensemble import RandomForestClassifier

cv = importHelper.load('cvMachine')
proc = importHelper.load('preprocessor')

trainUrl = './data/train.csv'
testUrl = './data/test.csv'

nafrac = 0.6
corthr = 0.6

def load_data(url):
	return pd.read_csv(url, index_col=0)

def get_correlated_cols(df, thr):
	cor = np.corrcoef(df.T)
	np.fill_diagonal(cor, 0)
	cor_indice = np.where(cor > thr)
	cor_indice = [(i, j) for i, j in zip(cor_indice[0], cor_indice[1])]
	cor_cols = []
	seen = set()
	for (i, j) in cor_indice:
		# the correlation matrix is symmetric, so record each pair only once
		if (j, i) not in seen:
			seen.add((i, j))
			cor_cols.append((df.columns[i], df.columns[j]))
	return cor_cols
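A hypothetical usage sketch for get_correlated_cols (not part of the original excerpt), dropping the second column of each highly correlated pair:

# Hypothetical usage; threshold and column choices are illustrative only.
df = load_data(trainUrl)
numeric = df.select_dtypes(include=[np.number])
for col_a, col_b in get_correlated_cols(numeric, corthr):
    if col_b in numeric.columns:
        numeric = numeric.drop(col_b, axis=1)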
	 
Example #20
# Statistical language model. Dirichlet and Jelinek-Mercer discounting methods are applied.

import importHelper
import pandas as pd
import random

everything = importHelper.load('everything')
static_vars = everything.static_vars

def jelinek_mercer(ct, p_ml,p_ref,a=0.1):
	from math import log
	log_p_s = (p_ml*(1-a)+p_ref.loc[p_ml.index]*a).map(log)
	return log_p_s

def dirichlet(ct, p_ml,p_ref,a=0.1):
	from math import log
	d = len(p_ml)
	u = a / (1+a)*d
	log_p_s = ((ct+u*p_ref.loc[ct.index])/(d+u)).map(log)
	return log_p_s

def lm(df, x_name, y_name, a=0.1, smooth_method=jelinek_mercer):
	'''
	df: DataFrame containing features and category. The features are a list of words representing the document.
	x_name: column name of features
	y_name: column name of the category
	a: discount parameter; should be tuned via cross validation
	smooth_method: method selected to discount the probabilities

	out: language model
	'''