def runFM(train_X, train_y, test_X, test_y, test_X2, label, dev_index, val_index):
    """Fit a sample-weighted FM_FTRL classifier for one toxicity label and
    return sigmoid-squashed predictions for both test matrices.

    Positive rows keep weight 1.0; negative rows are down-weighted by the
    per-label factor below.  ``test_y``, ``dev_index`` and ``val_index`` are
    accepted for signature compatibility but are not used in this body.
    """
    # Down-weighting factor applied to negative examples, per label.
    class_weights = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2
    }
    hyperparams = dict(
        alpha=0.02, beta=0.01, L1=0.00001, L2=30.0,
        D=train_X.shape[1],
        alpha_fm=0.1, L2_fm=0.5, init_fm=0.01, weight_fm=50.0, D_fm=200,
        e_noise=0.0, iters=3, inv_link="identity", e_clip=1.0,
        threads=4, use_avx=1, verbose=1,
    )
    model = FM_FTRL(**hyperparams)
    negative_weight = class_weights[label]
    sample_weight = np.array(
        [1.0 if target == 1 else negative_weight for target in train_y])
    # reset=False keeps any previously accumulated model state.
    model.fit(train_X, train_y, sample_weight, reset=False)
    return (sigmoid(model.predict(test_X)),
            sigmoid(model.predict(test_X2)))
def runFM(train_X, train_y, test_X, test_y, test_X2, params):
    """Train FM_FTRL for ``params['rounds']`` incremental passes, logging
    dev RMSE after each pass; return (dev predictions, second-test
    predictions).

    Note: ``params`` is mutated by design — 'D' is inserted and 'rounds'
    is popped before the remaining entries are forwarded to FM_FTRL.
    """
    params['D'] = train_X.shape[1]
    rounds = params.pop('rounds')
    fm = FM_FTRL(**params)
    print_step('Fit FM')
    for round_no in range(rounds):
        # reset=False: each pass continues from the previous pass's weights.
        fm.fit(train_X, train_y, reset=False)
        pred_test_y = fm.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            round_no + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('FM Predict 2/2')
    pred_test_y2 = fm.predict(test_X2)
    return pred_test_y, pred_test_y2
class vanila_FM_FTRL_Regressor:
    """Minimal sklearn-like fit/predict facade over wordbatch's FM_FTRL
    regressor (identity link, no L2 on the factor weights)."""

    def __init__(self, param_dict, D):
        # Hyper-parameters come from `param_dict`; D is the design-matrix
        # width.  THREAD is a module-level constant.
        p = param_dict
        self.model = FM_FTRL(
            alpha=p['alpha'],
            beta=p['beta'],
            L1=p['L1'],
            L2=p['L2'],
            D=D,
            alpha_fm=p['alpha_fm'],
            L2_fm=0.0,                 # factor weights left unregularized
            init_fm=p['init_fm'],
            D_fm=p['D_fm'],
            e_noise=p['e_noise'],
            iters=p['iters'],
            inv_link="identity",
            threads=THREAD,
        )

    def fit(self, X_train, y_train):
        """Fit the wrapped FM_FTRL model."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return raw (identity-link) predictions for X_test."""
        return self.model.predict(X_test)
def trainFMFTRL():
    """Build sparse train/valid matrices, fit FM_FTRL with early stopping
    on validation RMSLE, and return the trimmed frames, encoders, the
    fitted model and the last validation predictions.

    Relies on module-level names: prepSparseTrain/prepSparseTest, trn_file,
    trnidx/validx, threads, rmsle.  Targets are log1p(price), so RMSLE is
    computed on expm1 of target and prediction.
    """
    # Load Data: fit the feature pipeline on the training split, then apply
    # the fitted transformers (moddict) to the validation split.
    dftrain, X_train, y_train, moddict = prepSparseTrain({}, trn_file, trnidx)
    dfvalid, X_valid = prepSparseTest(moddict, trn_file, validx)
    # Train the model one pass at a time (iters=1) so we can early-stop
    # between passes.
    modelfm = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                      D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0,
                      init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                      inv_link="identity", threads=threads)  # iters=15
    baseline = 1.
    threshold = .0002   # minimum RMSLE improvement required to keep training
    for i in range(15):
        modelfm.fit(X_train, dftrain.target.values, verbose=1)
        predsfm = modelfm.predict(X=X_valid)
        score_ = rmsle(np.expm1(dfvalid.target.values), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ + threshold < baseline:
            baseline = score_
        else:
            break
    # Historical scores:
    # 0.42919 with zeros in val
    # 0.42160 removing zeros in val
    # X_train.shape = (1333501, 1902850)
    # ('FM_FTRL dev RMSLE:', 0.42850571762280409)
    # Reduce the number of columns kept on the returned frames.
    keep_cols = ['train_id', 'name', 'item_condition_id', 'category_name',
                 'brand_name', 'price', 'shipping', 'item_description',
                 'target', 'general_cat', 'subcat_1', 'subcat_2']
    dftrain, dfvalid = dftrain[keep_cols], dfvalid[keep_cols]
    return dftrain, dfvalid, y_train, moddict, modelfm, predsfm
def make_fmftrl_predictions(X_train, X_test, y):
    """Train an FM_FTRL regressor on (X_train, y) — identity link, 17 full
    passes — and return its predictions for X_test."""
    fm = FM_FTRL(
        alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
        D=X_train.shape[1],
        alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200,
        e_noise=0.0001, iters=17, inv_link='identity', threads=4)
    fm.fit(X_train, y, verbose=1)
    return fm.predict(X_test)
def getFMFTRL():
    """Mercari price-prediction pipeline (stemming variant): load the TSVs,
    build a wide sparse design matrix (hashed n-grams, count vectors,
    label-binarized crossed columns), train FM_FTRL with early stopping on
    a 10% dev split, and return the working frame, split indices, row
    counts and predictions.

    NOTE(review): original source was whitespace-collapsed; statement
    grouping was reconstructed and should be verified against upstream.
    Relies on module-level names: start_time, split_cat,
    handle_missing_inplace, cutting, to_categorical, normalize_text,
    develop, rmsle, nltk, stoplist/STOPFILE.
    """
    # os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    # nrow_test marks where the *original* train rows end inside `merge`
    # (kept train rows plus the price<1 rows appended right after them).
    nrow_test = train.shape[0]  # -dftt.shape[0]
    # Rows with price < 1.0 are removed from the training target but kept
    # in `merge` so they still contribute to the fitted vocabularies.
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])   # model target is log1p(price)
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    '''
    ix = (merge['brand_name']==merge['brand_name']) & \
    (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' +merge['name'][ix]
    '''
    # EXTRACT DEVELOPMENT TEST: 90/10 row-index split over training rows.
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    # merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    '''
    Stemmer
    '''
    # Adapted from:
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    # NOTE(review): the helpers below use Python-2-only `string` module
    # functions (string.strip/split/lower/join) — they fail on Python 3.
    from nltk.tokenize import WordPunctTokenizer
    # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer
    import string
    ps = Stemmer.Stemmer("english")   # NOTE(review): unused; `_stem` uses `stemmer` below
    _wsre = re.compile("\s+")
    _alphanumre = re.compile("[\w\-\' ]", re.UNICODE)

    def _removestopwords(txtwords):
        # Drop stopwords from each token list, lazily loading the stop list
        # from STOPFILE on first use.  NOTE(review): never called here.
        global stoplist
        # stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset([string.strip(l) for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]

    def _stem(txtwords):
        # Snowball-stem every token of every sub-list.
        return [stemmer.stemWords(t) for t in txtwords]

    def _removenonalphanumericchars(txtwords):
        # Keep only characters matching _alphanumre inside each token.
        return [[string.join([c for c in w if _alphanumre.search(c) is not None], "") for w in t] for t in txtwords]

    def _stripallwhitespace(txts):
        # NOTE(review): substitutes *all* whitespace runs with '' — this
        # glues adjacent words inside each string; confirm intended.
        return [_wsre.sub("", txt) for txt in txts]

    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, sentencetokenize=False, replacehyphenbyspace=True,
                       wordtokenize=False, lowercase=True, stem=True,
                       removenonalphanumericchars=True, stripallwhitespace=True):
        """
        Tokenize, normalize and stem one string; returns one space-joined string.

        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the original HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild the PorterStemmer object each time this function is called.)
        """
        if sentencetokenize:
            txts = nltk.word_tokenize(txt)
            # txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None
        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]
        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [string.split(t) for t in txts]
        txts = None
        if lowercase:
            txtwords = [[string.lower(w) for w in t] for t in txtwords]
        if stem:
            txtwords = _stem(txtwords)
        # TODO: Maybe remove Unicode accents? http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)
        txtwords = [[w for w in t if w != ""] for t in txtwords]
        txts = [string.join(words) for words in txtwords]
        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)
        return string.join(txts, sep=" ")

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] = [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))
    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.

    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        # ['brand_name', 'subcat_1', 'item_condition_id_str'],
        # ['brand_name', 'subcat_2', 'item_condition_id_str'],
        # ['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2']
    )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)
    D = 2 ** 30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        # Hash each member column into [0, D) with a distinct offset so the
        # summed code is (mostly) unique per value combination.
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10 ** 6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del(lb)
    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2 ** 29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    # Drop hashed columns that never fire.
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del(wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))
    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    # NOTE(review): X_stem_name is referenced below but never assigned in
    # this function — merge['stem_name'] is built above but not vectorized.
    # As written this raises NameError; confirm the missing vectorization
    # step against upstream.
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_stem_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col,
                           X_stem_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]        # training rows (price >= 1)
    X_test = sparse_merge[nrow_test:]    # original test rows
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        # train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.005, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15
    # One pass per loop iteration; stop once dev RMSLE stops improving.
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
class vanila_FM_FTRL_Regressor:
    """sklearn-style facade over wordbatch's FM_FTRL regressor.

    Hyper-parameters come from ``param_dict``; ``feature_dim`` is the
    width of the (sparse) design matrix.  ``fit`` accepts optional
    validation arguments only for interface compatibility — they are
    ignored.
    """

    def __init__(self, param_dict, feature_dim):
        p = param_dict
        self.iters = p['iters']
        self.model = FM_FTRL(
            alpha=p['alpha'],
            beta=p['beta'],
            L1=p['L1'],
            L2=p['L2'],
            D=feature_dim,
            alpha_fm=p['alpha_fm'],
            L2_fm=0.0,              # no L2 on the factor weights
            init_fm=p['init_fm'],
            D_fm=p['D_fm'],
            e_noise=p['e_noise'],
            iters=self.iters,
            inv_link="identity",
            threads=6,
        )

    def fit(self, X_train, y_train, X_valid=0, y_valid=0):
        """Train on the full training set; validation args are unused."""
        self.model.fit(X_train, y_train)

    def predict(self, X_valid):
        """Return predictions for the given matrix."""
        return self.model.predict(X_valid)
# Drop features with document frequency <= 1, then train FM_FTRL on the
# full training slice and predict the test slice.
# NOTE(review): original source was whitespace-collapsed; statement
# grouping reconstructed — verify against upstream.
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
sparse_merge = sparse_merge[:, mask]
X = sparse_merge[:nrow_train]        # training rows
X_test = sparse_merge[nrow_test:]    # test rows
print(sparse_merge.shape)
gc.collect()
train_X, train_y = X, y
fm_ftrl = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                  D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                  init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                  inv_link="identity", threads=4)
fm_ftrl.fit(train_X, train_y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
predsFm_ftrl = fm_ftrl.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
# Alternative models kept for reference:
# ridge = lm.Ridge(solver='auto',fit_intercept=True,alpha=1,max_iter=500,normalize=False,tol=0.05).fit(X=X,y=y) #0.4647
# ridge.fit(train_X, train_y)
# print('[{}] Train Ridge completed'.format(time.time() - start_time))
# predsRidge = ridge.predict(X_test)
# print('[{}] Predict Ridge completed'.format(time.time() - start_time))
# ftrl = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
# ftrl.fit(train_X, train_y)
# print('[{}] Train FTRL completed'.format(time.time() - start_time))
# predsFTRL = ftrl.predict(X_test)
# print('[{}] Predict FTRL completed'.format(time.time() - start_time))
del (X_dummies) # del(X_description) del (X_description1) del (X_description2) del (X_description3) print(train_X.shape) print(valid_X.shape) print('[{}] addition feature completed.'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.07, L2_fm=0.001, init_fm=0.01, D_fm=400, e_noise=0.0001, iters=1, inv_link="identity", threads=4, weight_fm=1.0) for i in range(4): model.fit(train_X, train_y) if debug: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) param = model.__getstate__() model.__setstate__((param[0], param[1], param[2], param[3], param[4] * 0.8, param[5], param[6], param[7], param[8], param[9], param[10], param[11], param[12], param[13], param[14], param[15], param[16], param[17] , param[18], param[19])) if debug: resdefm = preds resf = model.predict(X=X_test) res2.extend(resf) del (param) del (model) del (sparse_merge) del (train_X) del (train_y)
init_fm=0.01, weight_fm=50.0, D_fm=200, e_noise=0.0, iters=3, inv_link="identity", e_clip=1.0, threads=4, use_avx=1, verbose=1) clf.fit(train_features[trn_idx], train_target[trn_idx], train_weight[trn_idx], reset=False) class_pred[val_idx] = sigmoid( clf.predict(train_features[val_idx])) score = roc_auc_score(train_target[val_idx], class_pred[val_idx]) cv_scores.append(score) losses_per_folds[n_fold] += score / len(class_names) submission[class_name] += sigmoid( clf.predict(test_features)) / folds.n_splits #Classifier chain. Order of classes not optimized train_features = csr_matrix( hstack([ train_features, np.reshape( np.array([0 if x < 0.5 else 1 for x in class_pred]), (train.shape[0], 1)) ]))
init_fm=0.01, weight_fm=50.0, D_fm=200, e_noise=0.0, iters=3, inv_link="identity", e_clip=1.0, threads=4, use_avx=1, verbose=1) clf.fit(train_features[ind_trn], train_target[ind_trn], train_weight[ind_trn], reset=False) class_pred[ind_val] = sigmoid( clf.predict(train_features[ind_val])) score = roc_auc_score(train_target[ind_val], class_pred[ind_val]) cv_scores.append(score) losses_per_folds[n_fold] += score / len(class_names) submission[class_name] += sigmoid( clf.predict(test_features)) / folds.n_splits #Classifier chain. Order of classes not optimized train_features = csr_matrix( hstack([ train_features, np.reshape( np.array([0 if x < 0.5 else 1 for x in class_pred]), (train.shape[0], 1)) ]))
del(X_dummies) #del(X_description) del(X_description1) del(X_description2) del(X_description3) print(train_X.shape) print(valid_X.shape) print('[{}] addition feature completed.'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.07, L2_fm=0.001, init_fm=0.01, D_fm=400, e_noise=0.0001, iters=1, inv_link="identity", threads=4,weight_fm = 1.0) for i in range(4): model.fit(train_X, train_y) if debug: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) param = model.__getstate__() model.__setstate__((param[0],param[1],param[2],param[3],param[4] * 0.8,param[5],param[6],param[7], param[8],param[9],param[10],param[11],param[12],param[13],param[14],param[15],param[16],param[17] , param[18],param[19])) if debug: resdefm = preds resf = model.predict(X=X_test) res2.extend(resf) del(param) del(model) del(sparse_merge) del(train_X) del(train_y) del(valid_X)
] ).tocsr() del test_word_features, tst_str_array, test_char_features gc.collect() print("Fin Stack test") with timer("Scoring FM FTRL"): clf = FM_FTRL( alpha=0.02, beta=0.01, L1=0.00001, L2=30.0, D=train_features.shape[1], alpha_fm=0.1, L2_fm=0.5, init_fm=0.01, weight_fm=50.0, D_fm=200, e_noise=0.0, iters=5, inv_link="identity", e_clip=1.0, threads=1, use_avx=1, verbose=1 ) clf.fit(train_features, labels) train_pred = clf.predict(train_features) pred = clf.predict(test_features) score = sqrt(mean_squared_error(labels, train_pred)) print("FINAL RMSE {}".format(score)) sub = pd.read_csv('../input/sample_submission.csv') sub['deal_probability'] = pred sub['deal_probability'].clip(0.0, 1.0, inplace=True) print("Output Prediction CSV") sub.to_csv('subm/wordbatch_fmtrl_submission.csv', index=False) ''' FINAL RMSE 0.22547715723618647 [Scoring FM FTRL] done in 692 s FINAL RMSE 0.22721282218651886
def getFMFTRL():
    """Mercari price-prediction pipeline (brand-prefix variant): load the
    TSVs, build the sparse design matrix (hashed name/category/description
    n-grams, count vectors, label-binarized crossed columns), train
    FM_FTRL with early stopping on a 10% dev split, and predict the test
    rows.

    NOTE(review): original source was whitespace-collapsed; statement
    grouping reconstructed — verify against upstream.  Relies on
    module-level names: start_time, split_cat, handle_missing_inplace,
    cutting, to_categorical, normalize_text, develop, rmsle.
    """
    # os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    # nrow_test marks where the *original* train rows end inside `merge`
    # (kept train rows plus the price<1 rows appended right after them).
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])   # model target is log1p(price)
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # Prepend the brand to the name for rows where the brand is present
    # (self-equality is a notnull check) and the lowercased brand does not
    # appear as a value in the `name` column.  NOTE(review): `isin` tests
    # membership against the whole name column, not row-wise containment —
    # confirm intended.
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    # EXTRACT DEVELOPMENT TEST: 90/10 row-index split over training rows.
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    # merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.

    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        # ['brand_name', 'subcat_1', 'item_condition_id_str'],
        # ['brand_name', 'subcat_2', 'item_condition_id_str'],
        # ['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)
    D = 2 ** 30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        # Hash each member column into [0, D) with a distinct offset so the
        # summed code is (mostly) unique per value combination.
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10 ** 6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2 ** 29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    # Drop hashed columns that never fire.
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))
    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2 ** 29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge[:nrow_train]        # training rows (price >= 1)
    X_test = sparse_merge[nrow_test:]    # original test rows
    # Second pruning pass: keep only columns active in the training slice.
    mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    X = X[:, mask]
    X_test = X_test[:, mask]
    print(X.shape)
    gc.collect()
    if develop:
        # train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]
    # NOTE(review): D uses sparse_merge's (pre-second-mask) width, which is
    # wider than X after the second pruning — presumably a safe upper bound
    # for FM_FTRL's hash space; confirm.
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15
    # One pass per loop iteration; stop once dev RMSLE stops improving.
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    # NOTE(review): predsfm is unbound here when develop is False — the
    # final return would raise NameError; confirm develop is always True.
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
# K-fold training loop: fit on each train split, report fold RMSE on the
# held-out split, and write a per-fold submission CSV clipped to [0, 1].
# NOTE(review): whitespace-collapsed source — placement of the submission
# block inside the fold loop was reconstructed from the fold_id-stamped
# filename; verify against upstream.  `model` is reused across folds
# without re-initialization, so later folds continue from earlier state.
fold_id = -1
X = train_features
y = labels
x_test = test_features
val_predict = np.zeros(y.shape)   # out-of-fold predictions
for train_index, val_index in kf.split(X):
    fold_id += 1
    x_train, x_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    model.fit(x_train, y_train)
    train_pred = model.predict(x_train)
    y_pred = model.predict(x_val)
    val_predict[val_index] = y_pred
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print('valid score: {}'.format(rmse))
    sub = pd.read_csv('../input/sample_submission.csv')
    pred = model.predict(x_test)
    sub['deal_probability'] = pred
    sub['deal_probability'].clip(0.0, 1.0, inplace=True)
    print("Output Prediction CSV")
    sub.to_csv(
        'subm/wordbatch_fmtrl_submissionV3_{}.csv'.format(fold_id),
        index=False)
# val_predicts = pd.DataFrame(data=val_predict, columns= labels)
# val_predicts['user_id'] = train_user_ids
# Optional dev split, single FM_FTRL fit (15 internal passes), dev RMSLE
# report, then test-set prediction.
# NOTE(review): when develop is False, train_X/train_y must already be
# defined earlier in the file — otherwise this raises NameError; confirm.
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y,
                                                          train_size=0.90,
                                                          random_state=233)
model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                inv_link="identity", threads=4)
model.fit(train_X, train_y)
print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
if develop:
    # Targets are log1p(price): undo with expm1 before RMSLE.
    predsfm = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
    # 0.44532
# Full data 0.424681
predsFM = model.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
# Blend the earlier model's test predictions (preds1, labeled LGB in the
# log message) with a freshly trained FM_FTRL (preds3) using fixed blend
# weights, then write the submission.  Prices were modeled as log1p,
# hence expm1 on the blended prediction.
preds1 = model.predict(X_test)
print('[{}] Predict LGB completed.'.format(time.time() - start_time))
model3 = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                 D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                 init_fm=0.01, D_fm=200, e_noise=0.0001, iters=17,
                 inv_link="identity", threads=4)
model3.fit(X, y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
preds3 = model3.predict(X_test)
# Blend weights presumably tuned offline — TODO confirm provenance.
final_pred = 0.39479745 * preds1 + 0.60691396 * preds3
submission['price'] = np.expm1(final_pred)
submission.to_csv("submission_2.csv", index=False)
print('[{}] Finished training models...'.format(time.time() - start_time))
beta=0.01, L1=0.00001, L2=0.1, D=train_features.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=3, inv_link="identity", threads=4) clf.fit(train_features[trn_idx], train_target.iloc[trn_idx]) # Compute prediction and use sigmoid class_pred[val_idx] = sigmoid( clf.predict(train_features[val_idx])) score = roc_auc_score(train_target.iloc[val_idx], class_pred[val_idx]) cv_scores.append(score) losses_per_folds[n_fold] += score / len(class_names) # Compute test predictions submission[class_name + "_temp"] = clf.predict(test_features) # Compute mean (without NaN) class_mean = submission[class_name + "_temp"].mean() # Replace NaNs if any submission[class_name + "_temp"].fillna(class_mean, inplace=True) # Transform using sigmoid submission[class_name] += sigmoid( submission[class_name + "_temp"]) / folds.n_splits del submission[class_name + "_temp"]
class FM_FTRL_Model(BaseMlModel):
    """Thin wrapper around wordbatch's FM_FTRL for the project's BaseMlModel
    train/test workflow.

    `train` fits on scaled training data and reports log-loss / AUC on a dev
    split; `test` refits (with different hyperparameters) and writes raw
    predictions to a submission text file.
    """

    def __init__(self):
        super().__init__()

    def train(self, tag):
        """Fit FM_FTRL on data for `tag` and print train/dev log-loss and AUC.

        NOTE(review): `prepare_train_data` comes from BaseMlModel — assumed to
        return (train_data, train_labels, dev_data, dev_labels); confirm there.
        """
        print("FM_FTRL training")
        train_data, train_labels, dev_data, dev_labels = self.prepare_train_data(
            tag, 'FM_FTRL')
        # with_mean=False keeps the matrices sparse-compatible while scaling.
        train_data = preprocessing.scale(train_data, with_mean=False)
        dev_data = preprocessing.scale(dev_data, with_mean=False)
        self.clf = FM_FTRL(
            alpha=0.001,  # FTRL hyperparameter alpha for w0 and w
            beta=0.01,  # FTRL hyperparameter beta for w0 and w
            L1=0.00001,  # L1 regularization for w0 and w
            L2=0.1,  # L2 regularization for w0 and w
            D=train_data.shape[1],
            alpha_fm=0.001,  # FTRL alpha for the factorization vectors v
            L2_fm=0.0,  # L2 regularization for v
            init_fm=0.01,
            D_fm=200,
            e_noise=0.0001,
            # iters=5,
            inv_link="identity",
            threads=7,
        )
        self.clf.fit(train_data, train_labels)
        y_train = self.clf.predict(train_data)
        y_val = self.clf.predict(dev_data)
        # identity link: predictions are raw scores, so log_loss treats them
        # as probabilities — may fall outside [0, 1]; kept as-is from original.
        print('train_logloss: ' + str(log_loss(train_labels, y_train)))
        print("val_logloss: " + str(log_loss(dev_labels, y_val)))
        print("train_auc: " + str(roc_auc_score(train_labels, y_train)))
        print("val_auc: " + str(roc_auc_score(dev_labels, y_val)))

    def test(self, name):
        """Refit FM_FTRL for `name` and write one prediction per line to
        the submission file under config.output_prefix_path.

        NOTE(review): hyperparameters differ from train() (sigmoid link,
        D_fm=2, no regularization) — presumably tuned separately; confirm.
        """
        print("FM_FTRL testing...")
        train_data, train_labels, test_data = self.prepare_test_data(
            name, 'FM_FTRL')
        self.clf = FM_FTRL(
            alpha=0.01,  # FTRL hyperparameter alpha for w0 and w
            beta=0.01,  # FTRL hyperparameter beta for w0 and w
            L1=0,  # L1 regularization for w0 and w
            L2=0,  # L2 regularization for w0 and w
            D=train_data.shape[1],
            alpha_fm=0.005,  # FTRL alpha for the factorization vectors v
            L2_fm=0.01,  # L2 regularization for v
            init_fm=0.01,
            D_fm=2,
            e_noise=0.0001,
            iters=3,
            inv_link="sigmoid",
            threads=7,
        )
        self.clf.fit(train_data, train_labels)
        submit = self.clf.predict(test_data)
        with open(
                config.output_prefix_path + 'FM_FTRL_' + name + '-summit.txt',
                'w') as fr:
            for sub in submit:
                fr.write(str(sub) + '\n')
X_dev = sparse_merge[n_trains:n_trains + n_devs] X_test = sparse_merge[n_trains + n_devs:] Y_train = full_df.iloc[:n_trains].target.values.reshape(-1) Y_dev = full_df[n_trains:n_trains + n_devs].target print_time('Fitting FM_FTRL.') model = FM_FTRL(alpha=0.01, beta=0.01, L1=1, L2=0.001, \ D=sparse_merge.shape[1], alpha_fm=0.018, \ L2_fm=0.0, init_fm=0.01, D_fm=400, e_noise=0.0001, \ iters=25, inv_link="identity", threads=1) model.fit(X_train, Y_train) if SUBMIT: preds_fm_test = model.predict(X=X_test) else: preds_fm_dev = model.predict(X=X_dev) print(rmsle(Y_dev, preds_fm_dev)) del model print_time('FM_FTRL finished') #print_time('Fitting LGB...') #params = { # 'learning_rate': 0.6, # 'application': 'regression', # 'num_leaves': 31, # 'verbosity': -1, # 'metric': 'RMSE', # 'data_random_seed': 1,
class WordBatchModel(object):
    """Mercari price model: text/categorical feature extraction + FM_FTRL.

    `train(df)` fits all vectorizers and the FM_FTRL regressor on the
    dataframe; `predict(df)` re-applies the *fitted* vectorizers (transform
    only) and returns model predictions. The hstack column order must be
    identical in both paths — keep them in sync when editing.
    """

    def __init__(self):
        # Fitted feature extractors, populated by train():
        self.wb_desc = None  # wordbatch WordBag over item_description
        self.desc_indices = None  # boolean mask of desc columns with df > 1
        self.cv_name, self.cv_name2 = None, None  # uni/bi-gram name vectorizers
        self.cv_cat0, self.cv_cat1, self.cv_cat2 = None, None, None
        self.cv_brand = None
        self.cv_condition = None  # condition+shipping combined categorical
        self.cv_cat_brand = None  # category_name x brand_name cross
        self.desc3 = None  # trigram description vectorizer
        self.model = None  # fitted FM_FTRL

    def train(self, df):
        """Fit all vectorizers and the FM_FTRL model on `df`.

        Requires columns: item_description, name, subcat_0/1/2, brand_name,
        item_condition_id, shipping, category_name, target.
        Side effect: adds a 'cat_brand' column to `df`.
        """
        self.wb_desc = wordbatch.WordBatch(None,
                                           extractor=(WordBag, {
                                               "hash_ngrams": 2,
                                               "hash_ngrams_weights":
                                               [1.0, 1.0],
                                               "hash_size": 2**28,
                                               "norm": "l2",
                                               "tf": 1.0,
                                               "idf": None
                                           }),
                                           procs=8)
        self.wb_desc.dictionary_freeze = True
        X_desc = self.wb_desc.fit_transform(df['item_description'])
        # Keep only hash columns that occur at least twice (getnnz - 1
        # clipped to {0,1} is a document-frequency > 1 mask).
        self.desc_indices = np.array(np.clip(X_desc.getnnz(axis=0) - 1, 0, 1),
                                     dtype=bool)
        X_desc = X_desc[:, self.desc_indices]
        self.cv_name = CountVectorizer(min_df=2,
                                       ngram_range=(1, 1),
                                       binary=True,
                                       token_pattern="\w+")
        # Scalar weights (2x unigrams, 0.5x bigrams) tilt FM toward name unigrams.
        X_name = 2 * self.cv_name.fit_transform(df['name'])
        self.cv_name2 = CountVectorizer(min_df=2,
                                        ngram_range=(2, 2),
                                        binary=True,
                                        token_pattern="\w+")
        X_name2 = 0.5 * self.cv_name2.fit_transform(df['name'])
        self.cv_cat0 = CountVectorizer(min_df=2)
        X_category0 = self.cv_cat0.fit_transform(df['subcat_0'])
        self.cv_cat1 = CountVectorizer(min_df=2)
        X_category1 = self.cv_cat1.fit_transform(df['subcat_1'])
        self.cv_cat2 = CountVectorizer(min_df=2)
        X_category2 = self.cv_cat2.fit_transform(df['subcat_2'])
        # token_pattern=".+" treats the whole field as a single token.
        self.cv_brand = CountVectorizer(min_df=2, token_pattern=".+")
        X_brand = self.cv_brand.fit_transform(df['brand_name'])
        self.cv_condition = CountVectorizer(token_pattern=".+")
        # Encodes (condition, shipping) jointly: shipping shifts by 10.
        X_condition = self.cv_condition.fit_transform(
            (df['item_condition_id'] + 10 * df["shipping"]).apply(str))
        df["cat_brand"] = [
            a + " " + b
            for a, b in zip(df["category_name"], df["brand_name"])
        ]
        self.cv_cat_brand = CountVectorizer(min_df=10, token_pattern=".+")
        X_cat_brand = self.cv_cat_brand.fit_transform(df["cat_brand"])
        self.desc3 = CountVectorizer(ngram_range=(3, 3),
                                     max_features=1000,
                                     binary=True,
                                     token_pattern="\w+")
        X_desc3 = self.desc3.fit_transform(df["item_description"])
        X = hstack(
            (X_condition, X_desc, X_brand, X_category0, X_category1,
             X_category2, X_name, X_name2, X_cat_brand, X_desc3)).tocsr()
        y = df["target"].values
        self.model = FM_FTRL(alpha=0.01,
                             beta=0.01,
                             L1=0.00001,
                             L2=0.1,
                             D=X.shape[1],
                             alpha_fm=0.02,
                             L2_fm=0.0,
                             init_fm=0.01,
                             D_fm=200,
                             e_noise=0.0001,
                             iters=15,
                             inv_link="identity",
                             threads=4)
        self.model.fit(X, y)

    def predict(self, df):
        """Transform `df` with the fitted vectorizers (same column order and
        weights as train) and return FM_FTRL predictions.

        Side effect: adds a 'cat_brand' column to `df`.
        """
        X_desc = self.wb_desc.transform(df["item_description"])
        X_desc = X_desc[:, self.desc_indices]
        X_name = 2 * self.cv_name.transform(df["name"])
        X_name2 = 0.5 * self.cv_name2.transform(df["name"])
        X_category0 = self.cv_cat0.transform(df['subcat_0'])
        X_category1 = self.cv_cat1.transform(df['subcat_1'])
        X_category2 = self.cv_cat2.transform(df['subcat_2'])
        X_brand = self.cv_brand.transform(df['brand_name'])
        X_condition = self.cv_condition.transform(
            (df['item_condition_id'] + 10 * df["shipping"]).apply(str))
        df["cat_brand"] = [
            a + " " + b
            for a, b in zip(df["category_name"], df["brand_name"])
        ]
        X_cat_brand = self.cv_cat_brand.transform(df["cat_brand"])
        X_desc3 = self.desc3.transform(df["item_description"])
        X = hstack(
            (X_condition, X_desc, X_brand, X_category0, X_category1,
             X_category2, X_name, X_name2, X_cat_brand, X_desc3)).tocsr()
        return self.model.predict(X)
def getFMFTRL():
    """End-to-end Mercari FM_FTRL pipeline: load train/test TSVs, engineer
    text/categorical/crossed features into one sparse matrix, train FM_FTRL
    with early stopping on a dev split, and predict the test set.

    Returns (merge, trnidx, validx, nrow_train, nrow_test, glove_file,
    predsFM, predsfm).

    NOTE(review): relies on module-level names (start_time, develop,
    split_cat, handle_missing_inplace, cutting, to_categorical, rmsle,
    normalize_text, wordbatch, ...) — assumed defined elsewhere in the file.
    """
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    # nrow_test marks the end of the train+dftt rows within `merge` below.
    nrow_test = train.shape[0]  # -dftt.shape[0]
    # Rows priced below $1 are moved out of train (no usable target) but kept
    # in `merge` so their features participate in vectorizer fitting.
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # Prepend the brand to the name when the brand string is not already
    # contained in the (lower-cased) name. First clause filters out NaN brands.
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    '''
    Regex characteristics - carat, gb/tb, cpu
    '''

    def count_rgx(regexls, idx_, filter_=None):
        # Extract a numeric measure from name+description for rows idx_.
        # A value is taken only when every regex hit on a row agrees
        # (len(set(v)) == 1); optional filter_ whitelists plausible values.
        colvals = merge['name'][idx_] + ' ' + merge['item_description'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series([
                int(v[0]) if len(set(v)) == 1 else 0 for v in valsls
            ])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    def count_rgx_name(regexls, idx_, filter_=None):
        # Same as count_rgx but searches the name only, and takes the first
        # match rather than requiring all matches to agree.
        colvals = merge['name'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series(
                [int(v[0]) if len(v) != 0 else 0 for v in valsls])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    # gold  (karat count for jewelry items mentioning "gold")
    measures = np.zeros((merge.shape[0], 4))
    ix_chk = ((merge.name.str.contains('gold', case=False)) | \
        (merge.item_description.str.contains('gold', case=False))) & \
        (merge['subcat_1'] == 'Jewelry')
    rgxls = [
        r"(\d+)k ", r"(\d+)kt ", r"(\d+)k.", r"(\d+)kt.", r"(\d+)k,",
        r"(\d+)kt,", r"(\d+) k ", r"(\d+) kt", r"(\d+) k.", r"(\d+) kt.",
        r"(\d+) k,", r"(\d+) kt,"
    ]
    measures[ix_chk, 0] = count_rgx(rgxls,
                                    ix_chk,
                                    filter_=[10, 12, 14, 16, 18, 20, 21, 22,
                                             23, 24])
    # phone memory (GB)
    ix_chk = (merge['subcat_2'] == 'Cell Phones & Smartphones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 1] = count_rgx(rgxls, ix_chk)
    # console memory
    ix_chk = (merge['subcat_2'] == 'Consoles')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 2] = count_rgx(rgxls, ix_chk)
    # computer memory
    ix_chk = (merge['category_name'] == 'Electronics/Computers & Tablets/Laptops & Netbooks') | \
        (merge['category_name'] == 'Electronics/Computers & Tablets/Desktops & All-In-Ones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 3] = count_rgx(rgxls, ix_chk)
    # cpu
    # oz
    # diamond
    #r"(\d+) karat ", r"(\d+) carat "
    # NOTE(review): `measures` is computed but never written back into
    # `merge`, yet merge['measure_memory'] / merge['measure_gold'] are label-
    # binarized further down — confirm those columns are created elsewhere.
    '''
    Crossed columns
    '''

    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)
    # Hash each component column into a 2**30 space; the per-column offset
    # (`indicator`) keeps the component hashes from colliding trivially.
    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig,
        lb.fit_transform((merge['subcat_2']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig,
        lb.fit_transform((merge['brand_name']+merge['name']+merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    # Drop hash columns with document frequency <= 1.
    X_name = X_name[:, np.
                    array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                          dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:,
                  np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                           dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    # NOTE(review): 'measure_memory'/'measure_gold' are not assigned in this
    # function — see the `measures` note above.
    X_memory = lb.fit_transform(merge['measure_memory'])
    # clip(nnz - 10**6, 1, 0) is an all-ones mask for any realistic nnz;
    # presumably a column-count safety cap — TODO confirm the intent.
    mask = np.array(np.clip(X_memory.getnnz(axis=0) - 10**6, 1, 0), dtype=bool)
    X_memory = X_memory[:, mask]
    X_gold = lb.fit_transform(merge['measure_gold'])
    mask = np.array(np.clip(X_gold.getnnz(axis=0) - 10**6, 1, 0), dtype=bool)
    X_gold = X_gold[:, mask]
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))
    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col,
                           X_orig)).tocsr()
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_memory, X_gold)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col, X_memory, X_gold)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    # Rows [0, nrow_train) are train; rows [nrow_test, end) are the test set
    # (train + the sub-$1 rows occupy the first nrow_test rows of merge).
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15
    # One-epoch fits in a loop with early stopping on dev RMSLE.
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
        # 0.419741
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
def runChainedFM(train_X, train_y, test_X, test_y, test_X2, label,
                 dev_index, val_index):
    """Train a chained FM_FTRL for one toxicity `label`.

    Loads cached level-1 FM predictions, binarizes the *other* labels'
    predictions at 0.5, stacks them onto the input feature matrices, then
    fits a weighted FM_FTRL and returns sigmoid-squashed predictions.

    Args:
        train_X, test_X, test_X2: sparse feature matrices (dev, valid, test).
        train_y: binary targets for `label`.
        test_y: unused here; kept for interface parity with sibling runners.
        label: current class name (excluded from the level-1 features).
        dev_index, val_index: row indices splitting the cached level-1
            train matrix into dev/valid parts.

    Returns:
        (pred_test_y, pred_test_y2): sigmoid predictions for valid and test.
    """
    print_step('Loading Lvl1')
    lvl1_train, lvl1_test = load_cache('lvl1_fm')
    # Fix: the original evaluated this binarizing comprehension once with the
    # result discarded (dead code duplicating the expression below) — removed.
    lvl1_train = csr_matrix(
        pd.concat([
            lvl1_train[c].apply(lambda x: 0 if x < 0.5 else 1)
            for c in lvl1_train.columns if 'fm_' in c and c != label
        ],
                  axis=1).values)
    lvl1_test = csr_matrix(
        pd.concat([
            lvl1_test[c].apply(lambda x: 0 if x < 0.5 else 1)
            for c in lvl1_test.columns if 'fm_' in c and c != label
        ],
                  axis=1).values)
    print_step('Merging 1/3')
    # Split the cached train-side features to match dev/valid rows.
    lvl1_valid = lvl1_train[val_index]
    lvl1_train = lvl1_train[dev_index]
    train_X = csr_matrix(hstack([train_X, lvl1_train]))
    print_step('Merging 2/3')
    test_X = csr_matrix(hstack([test_X, lvl1_valid]))
    print_step('Merging 3/3')
    test_X2 = csr_matrix(hstack([test_X2, lvl1_test]))
    print_step('Modeling')
    # Per-class weight applied to negative samples only (positives get 1.0).
    class_weights = {
        'toxic': 1.0,
        'severe_toxic': 0.2,
        'obscene': 1.0,
        'threat': 0.1,
        'insult': 0.8,
        'identity_hate': 0.2
    }
    model = FM_FTRL(alpha=0.02,
                    beta=0.01,
                    L1=0.00001,
                    L2=30.0,
                    D=train_X.shape[1],
                    alpha_fm=0.1,
                    L2_fm=0.5,
                    init_fm=0.01,
                    weight_fm=50.0,
                    D_fm=200,
                    e_noise=0.0,
                    iters=3,
                    inv_link="identity",
                    e_clip=1.0,
                    threads=4,
                    use_avx=1,
                    verbose=1)
    train_weight = np.array(
        [1.0 if x == 1 else class_weights[label] for x in train_y])
    model.fit(train_X, train_y, train_weight, reset=False)
    # Identity link during training; sigmoid applied at prediction time.
    pred_test_y = sigmoid(model.predict(test_X))
    pred_test_y2 = sigmoid(model.predict(test_X2))
    return pred_test_y, pred_test_y2
test.loc[~test["category_1"].isin(tmp), "category_1"] = "others" for category in test["category_1"].unique(): tmp = ps.transform(test.loc[test["category_1"] == category]) tmp[features_to_be_scaled_lst] = sc.transform( tmp[features_to_be_scaled_lst]) X_test = ste.transform(tmp) gc.collect() predsF = FTRL_model.predict(X_test) print("predsF shape:", predsF.shape) print("predsF NaNs:", np.isnan(predsF).sum()) print('[{}] Predict FTRL completed'.format(time.time() - start_time)) predsFM = FM_FTRL_model.predict(X_test) print("predsFM shape:", predsFM.shape) print("predsFM NaNs:", np.isnan(predsFM).sum()) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) X_test = X_test[:, mask] predsL = lgb_model.predict(X_test) print('[{}] Predict LGB completed.'.format(time.time() - start_time)) preds = coeffs_dict[category][0] * predsF + coeffs_dict[category][ 1] * predsFM + coeffs_dict[category][2] * predsL submission["price"].loc[test["category_1"] == category] = np.expm1( preds)
def getFMFTRL(moddict):
    """Fit the FM_FTRL pipeline and record every fitted transformer in
    `moddict` so unseen data can later be transformed identically.

    Loads train/test TSVs, builds crossed/hashed/count features, trains
    FM_FTRL with early stopping on a dev split, and predicts the test rows.

    Args:
        moddict: dict that is mutated in place; keys added include
            'cross_cols', 'wb_name', 'wb_name_mask', 'wb_cat', 'wb_cat_dict',
            'wb_cat_mask', 'wb_cat_ctgc', 'wb_cat_ctsc1', 'wb_cat_ctsc2',
            'wb_dscr', 'wb_dscr_mask', 'wb_brandname', 'wb_itemcond',
            'wb_shipping', 'all_mask', 'FMmodel'.

    Returns:
        (merge, moddict, trnidx, validx, nrow_train, nrow_test, glove_file,
         predsFM, predsfm)
    """
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    # End of the train+dftt rows within `merge` below.
    nrow_test = train.shape[0]  # -dftt.shape[0]
    # Sub-$1 rows leave the training target but stay in `merge` for fitting
    # the vectorizers on the full corpus.
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()
    cpuStats()
    merge = prepFMFeatures(merge)
    cpuStats()
    merge.head()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            # Fix: was fit_transform, needlessly refitting the binarizer
            # fitted two lines above on the same column (identical output).
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights":
                                                 [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    # Mask is computed on the *train* rows only so the same columns can be
    # selected for unseen data later.
    moddict['wb_name_mask'] = np.array(np.clip(
        X_name[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights":
                                                [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(cats)
    X_cat_tmp = moddict['wb_cat'].transform(cats)
    # Row-per-category lookup so each merge row reuses its category's vector.
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(cats.tolist(), range(len(cats)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    moddict['wb_cat_mask'] = np.array(np.clip(
        X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                      dtype=bool)
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''
    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))
    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights":
                                                 [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    moddict['wb_dscr_mask'] = np.array(np.clip(
        X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
    # Binarizers are fitted on train rows only, then applied to all rows.
    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))
    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))
    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))
    print(
        X_itemcond.shape,
        X_shipping.shape,  #X_dummies.shape,
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,  #X_dummies,
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    moddict['all_mask'] = np.array(np.clip(
        sparse_merge.getnnz(axis=0) - 1, 0, 1),
                                   dtype=bool)
    sparse_merge = sparse_merge[:, moddict['all_mask']]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15
    # One-epoch fits with early stopping: require at least 0.0004 improvement.
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break
    # 0.41357
    moddict['FMmodel'] = model
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
    predsFM = moddict['FMmodel'].predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, moddict, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
def trainFMFTRL(moddict):
    """Fit the FM_FTRL pipeline on the training file, storing every fitted
    transformer in `moddict`; the test file is loaded and passed through the
    crossed-column and name transformers as a dry run of the transform path.

    Args:
        moddict: dict mutated in place with fitted vectorizers/masks and,
            finally, the trained model under 'FMmodel'.

    Returns:
        (merge, moddict, trnidx, validx, nrow_train, predsfm)
    """
    merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8')
    mergetst = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    #test = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', merge.shape)
    # Sub-$1 rows lose their target but stay appended so vectorizers see them.
    dftt = merge[(merge.price < 1.0)]
    merge = merge.drop(merge[(merge.price < 1.0)].index)
    del dftt['price']
    nrow_train = merge.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(merge["price"])
    merge = pd.concat([merge, dftt])
    merge['target'] = np.log1p(merge["price"])
    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    gc.collect()
    cpuStats()
    merge = prepFMFeatures(merge)
    mergetst = prepFMFeatures(mergetst)
    cpuStats()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            # Fix: was fit_transform, needlessly refitting the binarizer
            # fitted two lines above on the same column (identical output).
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Test Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    for i in range(0, len(cross_nm)):
        if i == 0:
            x_coltst = moddict['cross_cols'][cross_nm[i]].transform(
                mergetst[cross_nm[i]])
        else:
            # Fix: original stacked onto x_col (the TRAIN matrix) and called
            # fit_transform, refitting the binarizer on test data and
            # corrupting both matrices. Stack onto x_coltst and only
            # transform with the already-fitted binarizer.
            x_coltst = hstack(
                (x_coltst, moddict['cross_cols'][cross_nm[i]].transform(
                    mergetst[cross_nm[i]])))
        del mergetst[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights":
                                                 [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    # Column indices (train rows only) with document frequency > 0.
    moddict['wb_name_mask'] = np.where(
        X_name[:nrow_train].getnnz(axis=0) > 0)[0]
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    test Hash name
    '''
    # Fix: original assigned the test-set matrix back to X_name, clobbering
    # the train matrix that is hstacked below (row counts would not match).
    # Keep the test matrix in its own variable.
    X_nametst = moddict['wb_name'].transform(mergetst['name'])
    X_nametst = X_nametst[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights":
                                                [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    #cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(categories)
    X_cat_tmp = moddict['wb_cat'].transform(categories)
    # Row-per-category lookup so each merge row reuses its category's vector.
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(categories.tolist(), range(len(categories)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    #moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0]
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''
    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))
    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights":
                                                 [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    #moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    # Columns with document frequency > 1 on the train rows.
    moddict['wb_dscr_mask'] = np.where(
        X_description[:nrow_train].getnnz(axis=0) > 1)[0]
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
    # Binarizers are fitted on train rows only, then applied to all rows.
    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))
    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))
    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))
    print(
        X_itemcond.shape,
        X_shipping.shape,  #X_dummies.shape,
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,  #X_dummies,
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    print(50 * '-')
    cpuStats()
    print(50 * '-')
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    gc.collect()
    sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train]
    if develop:
        train_X, valid_X, train_y, valid_y = sparse_merge[trnidx], \
            sparse_merge[validx], \
            y.values[trnidx], y.values[validx]
    del sparse_merge
    gc.collect()
    print(50 * '*')
    cpuStats()
    print(50 * '*')
    print(train_X.shape[1])
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=4)  #iters=15
    print(50 * '|')
    cpuStats()
    print(50 * '|')
    # One-epoch fits with early stopping: require at least 0.0004 improvement.
    baseline = 1.
    for i in range(15):
        print(50 * '-')
        cpuStats()
        print(50 * '-')
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break
    moddict['FMmodel'] = model
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(predsfm)))
    gc.collect()
    return merge, moddict, trnidx, validx, nrow_train, predsfm
def wordbatch_algo(test):
    """Build sparse features for train + `test`, fit FM_FTRL and FTRL, and
    return their predictions on the test rows.

    Relies on module-level globals: `start_time`, `develop`, `TEST_SIZE`,
    `SPLIT_SEED`, `FM_iter`, `FTRL_iter`, plus helpers `split_cat`,
    `handle_missing_inplace`, `cutting`, `to_categorical`, `normalize_text`,
    `rmsle` — TODO confirm all are defined earlier in the file.

    Parameters
    ----------
    test : pd.DataFrame
        Test rows; concatenated after train so vectorizers see both.

    Returns
    -------
    (predsFM, predsF) : predictions on the test slice from FM_FTRL and FTRL.
    """
    import time
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # if 1 == 1:
    # train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    # test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    train = pd.read_table('../input/train.tsv', engine='c')
    # test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    # NOTE(review): nrow_test is captured BEFORE sub-$1 rows are dropped, and
    # is later used as the start offset of the test slice (rows < $1 are
    # re-appended between train and test via `dftt`).
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    # Target: log1p of price; predictions are inverted with expm1 below.
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # submission: pd.DataFrame = test[['test_id']]
    '''
    # Mean of each group
    # https://stackoverflow.com/questions/30244952/python-pandas-create-new-column-with-groupby-sum
    cat_mean = train['price'].groupby(train['category_name']).mean()
    cat_mean = pd.DataFrame({'category_name':cat_mean.index, 'cat_mean':cat_mean.values})
    merge = merge.merge(cat_mean, on=['category_name'], how='left')
    # print(merge.head())
    X_cat_mean = merge['cat_mean'].as_matrix().reshape(-1, 1)
    # X_cat_mean = normalize(np.nan_to_num(X_cat_mean).reshape(-1, 1), norm='max')
    cond_mean = train['price'].groupby(train['item_condition_id']).mean()
    cond_mean = pd.DataFrame({'item_condition_id':cond_mean.index, 'cond_mean':cond_mean.values})
    merge = merge.merge(cond_mean, on=['item_condition_id'], how='left')
    X_cond_mean = merge['cond_mean'].as_matrix().reshape(-1, 1)
    brand_mean = train['price'].groupby(train['brand_name']).mean()
    brand_mean = pd.DataFrame({'brand_name':brand_mean.index, 'brand_mean':brand_mean.values})
    merge = merge.merge(brand_mean, on=['brand_name'], how='left')
    X_brand_mean = merge['brand_mean'].as_matrix().reshape(-1, 1)
    ship_mean = train['price'].groupby(train['shipping']).mean()
    ship_mean = pd.DataFrame({'shipping':ship_mean.index, 'ship_mean':ship_mean.values})
    merge = merge.merge(ship_mean, on=['shipping'], how='left')
    X_ship_mean = merge['ship_mean'].as_matrix().reshape(-1, 1)
    '''
    # Free the originals; everything below works on `merge`.
    del train
    del test
    gc.collect()
    # Split 'category_name' ("A/B/C") into three category columns.
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    # Add some new features:
    # NOTE(review): X_len_desc / X_len_name are computed but never stacked
    # into sparse_merge below — dead features as written.
    X_len_desc = merge['item_description'].apply(
        lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = merge['name'].apply(lambda x: len(x)).as_matrix().reshape(
        -1, 1)
    # X_len_description = normalize(np.nan_to_num(X_len_description).reshape(-1, 1), norm='max')
    # X_len_name = normalize(np.nan_to_num(X_len_name).reshape(-1, 1), norm='max')
    print('[{}] Length `item_description` completed.'.format(time.time() -
                                                             start_time))
    print('[{}] Length `name` completed.'.format(time.time() - start_time))
    # Hash 1-2grams of `name` into a 2**29-dim binary-tf space.
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    # Drop hashed columns with document frequency <= 1.
    X_name = X_name[:, np.
                    array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                          dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    # NOTE(review): the same CountVectorizer is re-fit three times, so only
    # its last vocabulary survives — harmless here since each transform
    # happens right after its fit.
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    # Hash 1-2grams of `item_description`, l2-normalised raw tf.
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(
        X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))
    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(
        X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
        X_category2.shape, X_category3.shape, X_name.shape
    )  #, X_glove.shape, X_len_description.shape, X_len_name.shape, X_cat_mean.shape)
    # sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()
    # pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    # nrow_train, nrow_test= 1481661, 1482535
    # sparse_merge, y = pd.read_pickle("xy.pkl")
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    # Train rows are the first nrow_train; test rows start at nrow_test
    # (the low-price rows re-appended via dftt sit in between).
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        # Hold out a validation split for dev-time RMSLE reporting.
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)
    model.fit(train_X, train_y)
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        # expm1 inverts the log1p target before scoring.
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    gc.collect()
    print(predsFM)
    #model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
    # Second, linear-only FTRL model on the same features.
    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=FTRL_iter,
                 inv_link="identity",
                 threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    print(predsF)
    del train_X, train_y
    del X_test
    return predsFM, predsF
# Per-class CV loop: for each toxicity label, fit FM_FTRL per fold, collect
# out-of-fold predictions and averaged test predictions, then chain the
# binarised prediction as an extra feature for the next class.
# Relies on globals: class_names, class_weights, folds, train, test,
# train_features, test_features, submission, losses, losses_per_folds,
# sigmoid — TODO confirm all defined earlier in the file.
for i_c, class_name in enumerate(class_names):
    class_pred = np.zeros(len(train))
    train_target = train[class_name].values
    # Positive samples get weight 1.0; negatives get the per-class weight.
    train_weight = np.array([1.0 if x==1 else class_weights[class_name] for x in train_target])
    submission[class_name] = 0.0
    cv_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_features)):
        clf = FM_FTRL(
            alpha=0.02,
            beta=0.01,
            L1=0.00001,
            L2=30.0,
            D=train_features.shape[1],
            alpha_fm=0.1,
            L2_fm=0.5,
            init_fm=0.01,
            weight_fm= 50.0,
            D_fm=200,
            e_noise=0.0,
            iters=3,
            inv_link="identity",
            e_clip=1.0,
            threads=4,
            use_avx= 1,
            verbose=1
        )
        # reset=False keeps weights across fit calls on the same clf.
        clf.fit(train_features[trn_idx], train_target[trn_idx], train_weight[trn_idx], reset=False)
        # identity link + sigmoid maps raw scores to probabilities.
        class_pred[val_idx] = sigmoid(clf.predict(train_features[val_idx]))
        score = roc_auc_score(train_target[val_idx], class_pred[val_idx])
        cv_scores.append(score)
        losses_per_folds[n_fold] += score / len(class_names)
        # Average the fold models' test predictions.
        submission[class_name] += sigmoid(clf.predict(test_features)) / folds.n_splits
    #Classifier chain. Order of classes not optimized
    # Append the thresholded (0.5) prediction for this class as a new
    # binary feature column for subsequent classes.
    train_features = csr_matrix(hstack([train_features, np.reshape(np.array(
        [0 if x<0.5 else 1 for x in class_pred]), (train.shape[0], 1))]))
    test_features = csr_matrix(hstack([test_features, np.reshape(np.array(
        [0 if x<0.5 else 1 for x in submission[class_name]]), (test.shape[0], 1))]))
    cv_score = roc_auc_score(train_target, class_pred)
    losses.append(cv_score)
    train[class_name + "_oof"] = class_pred
    # NOTE(review): this print statement is truncated in this chunk of the
    # file; its argument list continues beyond the visible source.
    print('CV score for class %-15s is full %.6f | mean %.6f+%.6f'
class WordhashRegressor(object):
    """Hashing-vectorizer + FM_FTRL regressor over review JSON files.

    Either loads a pickled (WordBatch, FM_FTRL) pair from `pickle_model`,
    or trains from the JSON files in `datadir`. Text hashing is overlapped
    with file reading using a single worker thread.
    """

    def __init__(self, pickle_model="", datadir=None):
        # WordHash: 2**25-dim hashed 1-2grams, l2-normalised.
        self.wb = wordbatch.WordBatch(normalize_text,
                                      stemmer=stemmer,
                                      extractor=(WordHash, {
                                          "decode_error": 'ignore',
                                          "n_features": 2**25,
                                          "non_negative": False,
                                          "ngram_range": (1, 2),
                                          "norm": 'l2'
                                      }))
        # D must match the hash space above (2**25).
        self.clf = FM_FTRL(D=2**25,
                           D_fm=4,
                           iters=1,
                           inv_link="identity",
                           threads=multiprocessing.cpu_count() // 2)
        if datadir == None:
            # No data dir: restore a previously-trained model pair.
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def transform_batch(self, texts, batch_data):
        # Worker-thread target: hash one batch of texts into batch_data.
        # reset=False keeps the WordBatch dictionary across batches.
        batch_data.texts = self.wb.fit_transform(texts, reset=False)

    def train(self, datadir, pickle_model=""):
        """Stream reviews from JSON files in `datadir`, hash them in
        background batches, fit the FM_FTRL model, and optionally pickle
        the (WordBatch, FM_FTRL) pair to `pickle_model`."""
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        texts2 = []  # hashed batches, vstacked at the end
        batchsize = 100000
        batch_data = BatchData()
        p_input = None  # the in-flight hashing thread, if any
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        # Skip malformed JSON lines silently (best-effort).
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        # Subsample: keep only every 9th review.
                        if rcount % 9 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        # Map 1..5 star rating to [-1, 1].
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            # Wait for the previous batch's thread, harvest
                            # its result, then hand off the current batch.
                            if p_input != None:
                                p_input.join()
                                texts2.append(batch_data.texts)
                            p_input = threading.Thread(
                                target=self.transform_batch,
                                args=(texts, batch_data))
                            p_input.start()
                            texts = []
        # Drain the last in-flight batch and hash the leftover texts.
        if p_input != None:
            p_input.join()
            texts2.append(batch_data.texts)
        texts2.append(self.wb.fit_transform(texts, reset=False))
        del (texts)
        if len(texts2) == 1:
            texts = texts2[0]
        else:
            texts = ssp.vstack(texts2)
        # Freeze the dictionary so predict-time transforms are consistent.
        self.wb.dictionary_freeze = True
        self.clf.fit(texts, labels)
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        # Hash with the frozen dictionary, then score with FM_FTRL.
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
def wordbatch_algo():
    """Fit FM_FTRL and FTRL on the training set only, keeping the fitted
    vectorizers, then predict the test set chunk-by-chunk via `load_test()`.

    Relies on module-level globals: `start_time`, `develop`, `TEST_SIZE`,
    `SPLIT_SEED`, `FM_iter`, `FTRL_iter`, plus helpers `split_cat`,
    `handle_missing_inplace`, `cutting`, `to_categorical`, `normalize_text`,
    `fit_dummy`, `transform_dummy`, `load_test`, `rmsle` — TODO confirm all
    are defined earlier in the file.

    Returns
    -------
    (predsFM, predsF) : np.ndarray test predictions from FM_FTRL and FTRL.
    """
    import time
    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')
    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    # Target: log1p of price; expm1 inverts it for scoring below.
    y = np.log1p(train["price"])
    nrow_train = train.shape[0]
    # Training
    # Split 'category_name' ("A/B/C") into three category columns.
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    # Add some new features:
    # NOTE(review): these length features are computed but never stacked
    # into sparse_merge below — dead features as written.
    X_len_desc = train['item_description'].apply(
        lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = train['name'].apply(lambda x: len(x)).as_matrix().reshape(
        -1, 1)
    print('[{}] Length of text completed.'.format(time.time() - start_time))
    # Name
    # Fit/transform are split (instead of fit_transform) so the fitted
    # wb_name can be reused on the test chunks later.
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2**29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)
    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])
    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    # One CountVectorizer per category level, kept for test-time reuse.
    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])
    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2**28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])
    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))
    # fit_dummy returns (matrix, mapping); the mapping is reused at test time.
    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    del train
    gc.collect()
    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    # `mask` is kept so the exact same columns are selected on test chunks.
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge
    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>> FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))
    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01,
                  beta=0.01,
                  L1=0.00001,
                  L2=1.0,
                  D=train_X.shape[1],
                  iters=FTRL_iter,
                  inv_link="identity",
                  threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>> FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))
    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()
    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []
    for test in load_test():
        # Apply exactly the same preprocessing as on train.
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)
        handle_missing_inplace(test)
        #print('[{}] Handle missing completed.'.format(time.time() - start_time))
        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))
        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))
        # Add some new features:
        X_len_desc_test = test['item_description'].apply(
            lambda x: len(x)).as_matrix().reshape(-1, 1)
        X_len_name_test = test['name'].apply(
            lambda x: len(x)).as_matrix().reshape(-1, 1)
        # Transform with the vectorizers fitted on train (no refitting).
        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])
        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_brand_test = lb.transform(test['brand_name'])
        X_cond_test = transform_dummy(test['item_condition_id'].tolist(),
                                      d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)
        # Stack in the same column order as train, then apply the same mask.
        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, \
            X_category2_test, X_category3_test, X_name_test)).tocsr()
        X_test = X_test[:, mask]
        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()
        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()
        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()
    print(np.array(predsFM))
    print('-' * 20)
    print(np.array(predsF))
    print('-' * 20)
    return np.array(predsFM), np.array(predsF)
print("Fitting FM_FTRL model on training examples...") #FM_FTRL_model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=mpr, alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, # D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4) FM_FTRL_model = FM_FTRL(alpha=0.07, beta=0.05, L1=0.0001, L2=0.001,\ D=mpr, alpha_fm=0.1, L2_fm=0.000, init_fm=0.08, D_fm=100, e_noise=0.0001, iters=9, inv_link="identity", threads=4) FM_FTRL_model.fit(X_train, Y_train) Y_train = Y_train.reshape(-1, 1) Y_dev = Y_dev.reshape(-1, 1) Y_dev_preds_FM_FTRL = FM_FTRL_model.predict(X_dev) Y_dev_preds_FM_FTRL = Y_dev_preds_FM_FTRL.reshape(-1, 1) print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_FM_FTRL)) FM_FTRL_preds = FM_FTRL_model.predict(X_test) FM_FTRL_preds = np.expm1(FM_FTRL_preds) del FM_FTRL_model; gc.collect() sparse_merge = sparse_merge[:, np.where(sparse_merge.getnnz(axis=0) > 100)[0]] X_train = sparse_merge[:n_trains] Y_train = train_df.target.values X_dev = sparse_merge[n_trains:n_trains+n_devs] Y_dev = dev_df.target.values X_test = sparse_merge[n_trains+n_devs:]