def run(self):
    """
    Trains the model on the training set and returns predictions for the test set.

    Builds a fresh ``Model`` from ``self.train``, ``self.test``, ``self.users``,
    ``self.purchases`` and ``self.visits``, stores it on ``self.model``, runs it
    and returns the result of its ``predict()`` call.

    :return: whatever ``self.model.predict()`` returns
    """
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Initializing model...")
    self.model = Model(self.train, self.test, self.users, self.purchases, self.visits)
    print("Training model...")
    self.model.run()
    print("Making predictions...")
    return self.model.predict()
class Validator(object):
    """
    Date-based hold-out validation.

    Splits the loader's coupon and transaction tables into a training window
    ``[start, end)`` and a validation window ``[end, valid_end)``, trains a
    ``Model`` on the former and scores its predictions (MAP@k) against the
    purchases actually made in the latter.
    """

    def __init__(self, start_date, training_period, validation_period, loader):
        """
        :param start_date: string for start date for training period (format "%Y-%m-%d")
        :param training_period: length of training period in days
        :param validation_period: length of validation period in days
        :param loader: DataLoader
        """
        print("Creating training and test sets...")
        self.start = np.datetime64(start_date)
        self.end = self.start + training_period
        self.valid_end = self.end + validation_period
        assert isinstance(loader, DataLoader)

        coupons = loader.coupons_train
        display = coupons.DISPFROM
        # use coupons in training period for training
        self.train = self._drop_outside(coupons, display, self.start, self.end)
        # use coupons in validation period for testing
        self.test = self._drop_outside(coupons, display, self.end, self.valid_end)

        print("Splitting user transactions...")
        trans = loader.details_train
        # only allow transaction history of coupons in the training set within training period
        train_trans = trans[trans.COUPON_ID_hash.isin(self.train.COUPON_ID_hash)]
        self.purchases = self._drop_outside(train_trans, train_trans.I_DATE, self.start, self.end)

        # actual purchases made during validation period, excluding training coupons
        tp = self._drop_outside(trans, trans.I_DATE, self.end, self.valid_end)
        tp.drop(tp[tp.COUPON_ID_hash.isin(self.train.COUPON_ID_hash)].index, inplace=True)
        self.test_purchases = tp
        # check that all test purchases come from test coupon data
        assert tp[tp.COUPON_ID_hash.isin(self.test.COUPON_ID_hash)].shape == tp.shape

        # user visits
        self.visits = loader.visits
        # users
        self.users = loader.user_list

        print("Getting actual purchases...")
        # actual purchases/recommendations in Kaggle submission format
        self.actual = self._actual_purchases()
        self.model = None

    @staticmethod
    def _drop_outside(frame, dates, lower, upper):
        """
        Returns a deep copy of ``frame`` without the rows dated outside ``[lower, upper)``.

        :param frame: DataFrame to filter (left untouched)
        :param dates: Series aligned with ``frame`` holding each row's date
        :param lower: inclusive lower bound
        :param upper: exclusive upper bound
        :return: new DataFrame; rows are removed by index label
        """
        # A row is dropped only when one of the comparisons is True, so rows whose
        # date compares False both ways (e.g. missing dates) are kept.
        outside = (dates >= upper) | (dates < lower)
        kept = frame.copy(deep=True)
        kept.drop(frame[outside].index, inplace=True)
        return kept

    def run(self):
        """
        Trains the model on the training set and returns predictions for the test set.

        :return: whatever ``self.model.predict()`` returns
        """
        print("Initializing model...")
        self.model = Model(self.train, self.test, self.users, self.purchases, self.visits)
        print("Training model...")
        self.model.run()
        print("Making predictions...")
        return self.model.predict()

    def _actual_purchases(self):
        """
        Returns a pandas.DataFrame of the actual coupon purchases during the validation period.
        Format of the DataFrame is identical to the kaggle submission format: one row per
        row of ``self.users`` with columns ``USER_ID_hash`` and ``PURCHASED_COUPONS`` (a
        space-separated string of coupon hashes, empty for users without purchases).
        """
        # Sort once (earliest purchase first, larger ITEM_COUNT first within a date) and
        # group, instead of re-filtering the whole purchase table for every user.
        # DataFrame.sort_values replaces the removed DataFrame.sort (pandas >= 0.17).
        ordered = self.test_purchases.sort_values(
            ["I_DATE", "ITEM_COUNT"], ascending=[True, False], axis=0)
        # sort=False keeps the row order established above inside every group.
        coupons_by_user = {
            user_id: " ".join(group.COUPON_ID_hash.tolist())
            for user_id, group in ordered.groupby("USER_ID_hash", sort=False)
        }
        results = [[user_id, coupons_by_user.get(user_id, "")]
                   for user_id in self.users.USER_ID_hash]
        return pd.DataFrame(results, columns=["USER_ID_hash", "PURCHASED_COUPONS"])

    def mapk(self, k, actual, predicted):
        """
        :param k: max length of predicted sequence
        :param actual: DataFrame of actual purchases for each user (kaggle format)
        :param predicted: DataFrame of predicted purchases for each user (kaggle format)
        :return: tuple of (Mean Average Precision at k, DataFrame with one row per user
                 holding the user hash in column 0 and that user's AP@k in column 1)

        Rows of ``actual`` and ``predicted`` are paired by position and must list the
        same users in the same order.

        See https://github.com/benhamner/Metrics/blob/master/R/R/metrics.r
        """
        print("Computing MAP score...")
        scores = []
        users = []
        rows = zip(actual.USER_ID_hash, actual.PURCHASED_COUPONS,
                   predicted.USER_ID_hash, predicted.PURCHASED_COUPONS)
        for user, bought, predicted_user, guessed in rows:
            # Compare the very rows that are scored against each other; the index
            # labels of the two frames do not have to agree.
            assert user == predicted_user
            scores.append(self.apk(k, bought, guessed))
            users.append(user)
        return np.array(scores).mean(), pd.DataFrame([users, scores]).transpose()

    @staticmethod
    def apk(k, actual, predicted):
        """
        :param k: max length of predicted sequence
        :param actual: actual Coupon hash tags as a whitespace-separated string
        :param predicted: predicted Coupon hash tags as a whitespace-separated string
        :return: Average Precision at k; 0.0 when there are no actual purchases or k <= 0
        """
        # split() without an argument yields [] for an empty string and ignores
        # repeated blanks; split(' ') would produce '' tokens that count as coupons.
        actual = actual.split()
        predicted = predicted.split()
        if k <= 0 or not actual:
            return 0.0

        wanted = set(actual)
        seen = set()
        hits = 0.0
        score = 0.0
        for rank, coupon in enumerate(predicted[:k], start=1):
            # A coupon only counts the first time it is predicted.
            if coupon in wanted and coupon not in seen:
                hits += 1
                score += hits / rank
            seen.add(coupon)
        return score / min(len(actual), k)