def main():
    """Fit FTRL, FM_FTRL and LightGBM on the Mercari data and write a blend.

    Reads the train/test TSV files, stacks them into one frame, builds a
    single sparse feature matrix (hashed `name` and `item_description`,
    count-vectorized categories, binarized brand, condition/shipping
    dummies), fits the three models on ``log1p(price)`` and writes the
    weighted blend, mapped back with ``expm1``, to
    ``submission_wordbatch_ftrl_fm_lgb.csv``.

    Training rows priced below 1.0 are excluded from fitting the models but
    stay in the stacked frame (between the training and the test rows), so
    they still take part in vectorizer fitting.

    When the module-level ``develop`` flag is truthy, 5% of the training
    rows are held out and each model's RMSLE on them is printed.
    """
    from time import gmtime, strftime

    start_time = time.time()

    def report(template):
        # Fill the `{}` slot with the seconds elapsed since `start_time`.
        print(template.format(time.time() - start_time))

    def more_frequent_than(matrix, count):
        # Keep only the columns that have more than `count` non-zero entries.
        return matrix[:, matrix.getnnz(axis=0) > count]

    def hashed_text(column, bag_settings):
        # Vectorize one text column with WordBatch, then drop the columns
        # that are non-zero in at most one row.
        batcher = wordbatch.WordBatch(normalize_text,
                                      extractor=(WordBag, bag_settings),
                                      procs=8)
        batcher.dictionary_freeze = True
        hashed = batcher.fit_transform(column)
        del batcher
        return more_frequent_than(hashed, 1)

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    raw_train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    raw_test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    report('[{}] Finished to load data')
    print('Train shape: ', raw_train.shape)
    print('Test shape: ', raw_test.shape)

    # The test rows start at this offset in the stacked frame, because the
    # low-priced rows are re-appended right after the kept training rows.
    nrow_test = raw_train.shape[0]
    unpriced = raw_train[raw_train.price < 1.0]
    priced = raw_train.drop(unpriced.index)
    unpriced = unpriced.drop('price', axis=1)
    nrow_train = priced.shape[0]
    y = np.log1p(priced["price"])

    merge = pd.concat([priced, unpriced, raw_test])
    submission = raw_test[['test_id']]
    del raw_train, priced, raw_test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = zip(
        *merge['category_name'].apply(split_cat))
    merge.drop('category_name', axis=1, inplace=True)
    report('[{}] Split categories completed.')

    handle_missing_inplace(merge)
    report('[{}] Handle missing completed.')

    cutting(merge)
    report('[{}] Cut completed.')

    to_categorical(merge)
    report('[{}] Convert categorical completed')

    X_name = hashed_text(merge['name'], {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.5, 1.0],
        "hash_size": 2 ** 29,
        "norm": None,
        "tf": 'binary',
        "idf": None,
    })
    report('[{}] Vectorize `name` completed.')

    counter = CountVectorizer()
    X_category1, X_category2, X_category3 = (
        counter.fit_transform(merge[field])
        for field in ('general_cat', 'subcat_1', 'subcat_2'))
    report('[{}] Count vectorize `categories` completed.')

    X_description = hashed_text(merge['item_description'], {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2 ** 28,
        "norm": "l2",
        "tf": 1.0,
        "idf": None,
    })
    report('[{}] Vectorize `item_description` completed.')

    X_brand = LabelBinarizer(sparse_output=True).fit_transform(
        merge['brand_name'])
    report('[{}] Label binarize `brand_name` completed.')

    dummy_frame = pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                 sparse=True)
    X_dummies = csr_matrix(dummy_frame.values)
    report('[{}] Get dummies on `item_condition_id` and `shipping` completed.')

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    feature_blocks = (X_dummies, X_description, X_brand, X_category1,
                      X_category2, X_category3, X_name)
    sparse_merge = hstack(feature_blocks).tocsr()
    report('[{}] Create sparse merge completed')

    # First pruning pass: columns must be non-zero in more than one row.
    print(sparse_merge.shape)
    sparse_merge = more_frequent_than(sparse_merge, 1)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)
    else:
        train_X, train_y = X, y

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0,
                 D=sparse_merge.shape[1], iters=50, inv_link="identity",
                 threads=1)
    model.fit(train_X, train_y)
    report('[{}] Train FTRL completed')
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    report('[{}] Predict FTRL completed')

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    report('[{}] Train ridge v2 completed')
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    report('[{}] Predict FM_FTRL completed')

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Second, stricter pruning pass for LightGBM: columns must be non-zero
    # in more than 100 rows. The data is re-sliced and re-split afterwards
    # so that train, validation and test all share the reduced column set.
    print(sparse_merge.shape)
    sparse_merge = more_frequent_than(sparse_merge, 100)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)
    else:
        train_X, train_y = X, y

    d_train = lgb.Dataset(train_X, label=train_y)
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]
    else:
        watchlist = [d_train]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000,
                      valid_sets=watchlist, early_stopping_rounds=1000,
                      verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)
    report('[{}] Predict LGB completed.')

    # Fixed-weight blend in log space, converted back to prices on write.
    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
def main():
    """Fit FTRL, FM_FTRL and LightGBM and blend them with learned weights.

    Same feature pipeline as the fixed-weight variant: the Mercari train and
    test TSVs are stacked, vectorized into one sparse matrix and the three
    models are fitted on ``log1p(price)``. Predictions of every model are
    collected in ``dev_preds`` (hold-out rows) and ``test_preds`` (test
    rows), in the order FTRL, FM_FTRL, LightGBM.

    When the module-level ``develop`` flag is truthy, blend weights are
    obtained from ``get_best_ratios`` on the hold-out predictions and applied
    to the test predictions via ``aggregate_predicts_N``. Without a hold-out
    set no weights can be learned, so the fixed 0.2 / 0.3 / 0.5 blend
    (FTRL / LightGBM / FM_FTRL) is used instead.

    The result is written to ``submission_wordbatch_ftrl_fm_lgb.csv``.
    """
    from time import gmtime, strftime

    start_time = time.time()

    def elapsed():
        # Seconds since the pipeline started; used in every progress line.
        return time.time() - start_time

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    dev_preds = []
    test_preds = []

    print('[{}] Finished to load data'.format(elapsed()))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    # Test rows begin at this offset in `merge`: the low-priced rows are
    # appended right after the kept training rows, restoring the original
    # training row count ahead of the test rows.
    nrow_test = train.shape[0]
    dftt = train[train.price < 1.0]
    train = train.drop(dftt.index)
    dftt = dftt.drop('price', axis=1)
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(split_cat))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(elapsed()))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(elapsed()))

    cutting(merge)
    print('[{}] Cut completed.'.format(elapsed()))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(elapsed()))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.5, 1.0],
        "hash_size": 2 ** 29,
        "norm": None,
        "tf": 'binary',
        "idf": None,
    }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    # Drop hashed columns that are non-zero in at most one row.
    X_name = X_name[:, X_name.getnnz(axis=0) > 1]
    print('[{}] Vectorize `name` completed.'.format(elapsed()))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(elapsed()))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2 ** 28,
        "norm": "l2",
        "tf": 1.0,
        "idf": None,
    }), procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, X_description.getnnz(axis=0) > 1]
    print('[{}] Vectorize `item_description` completed.'.format(elapsed()))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(elapsed()))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'
          .format(elapsed()))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(elapsed()))

    # Remove features with document frequency <= 1.
    print(sparse_merge.shape)
    sparse_merge = sparse_merge[:, sparse_merge.getnnz(axis=0) > 1]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0,
                 D=sparse_merge.shape[1], iters=50, inv_link="identity",
                 threads=1)
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(elapsed()))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)

    predsF = model.predict(X_test)
    test_preds.append(predsF)
    print('[{}] Predict FTRL completed'.format(elapsed()))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    # The original message said "ridge v2", a copy-paste leftover; this
    # step trains FM_FTRL.
    print('[{}] Train FM_FTRL completed'.format(elapsed()))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)

    predsFM = model.predict(X_test)
    test_preds.append(predsFM)
    print('[{}] Predict FM_FTRL completed'.format(elapsed()))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <= 100 (LightGBM only).
    print(sparse_merge.shape)
    sparse_merge = sparse_merge[:, sparse_merge.getnnz(axis=0) > 100]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    # The split MUST be redone on the pruned matrix. Reusing the earlier
    # train_X/valid_X would train LightGBM on the wider column set while
    # X_test has the pruned one, so predict() would see a different number
    # of features. Same sizes and random_state give the same row partition,
    # so `valid_y` still lines up with the FTRL / FM_FTRL entries already
    # stored in `dev_preds`.
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000,
                      valid_sets=watchlist, early_stopping_rounds=1000,
                      verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)

    predsL = model.predict(X_test)
    test_preds.append(predsL)
    print('[{}] Predict LGB completed.'.format(elapsed()))

    if develop:
        # Learn the blend weights on the hold-out rows, report the blended
        # hold-out error, then apply the same weights to the test rows.
        best_ratios = get_best_ratios(dev_preds, valid_y)
        print(best_ratios)
        dev_blend = aggregate_predicts_N(dev_preds, best_ratios)
        # The blend is in log1p space (it is passed through expm1 before
        # being written below), so convert both sides as the per-model
        # scores above do.
        print("(Best) RMSL error on dev set:",
              rmsle(np.expm1(valid_y), np.expm1(dev_blend)))
        preds = aggregate_predicts_N(test_preds, best_ratios)
    else:
        # No hold-out predictions to fit weights on: use the fixed blend.
        preds = predsF * 0.2 + predsL * 0.3 + predsFM * 0.5

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
def getFMFTRL():
    """Build sparse features, fit FM_FTRL and return its predictions.

    Loads the Mercari train/test TSVs, stacks them into ``merge``, engineers
    hashed text, category, brand, dummy and crossed-column features, and
    fits an FM_FTRL model on ``log1p(price)`` one pass at a time.

    When the module-level ``develop`` flag is truthy, 10% of the training
    rows are held out (``validx``) and training stops as soon as the
    hold-out RMSLE stops improving (at most 15 passes). Otherwise the model
    is fitted for 15 passes on all training rows and no hold-out predictions
    are produced.

    Returns:
        tuple: ``(merge, trnidx, validx, nrow_train, nrow_test, glove_file,
        predsFM, predsfm)`` where ``predsFM`` are the test-row predictions
        and ``predsfm`` the hold-out predictions (``None`` when ``develop``
        is falsy). Both are in ``log1p(price)`` space.
    """
    start_time = time.time()

    # NOTE(review): machine-specific working directory kept from the
    # original; all relative paths below depend on it.
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    # Test rows begin at this offset in `merge`: low-priced rows are put
    # back right after the kept training rows.
    nrow_test = train.shape[0]
    dftt = train[train.price < 1.0]
    train = train.drop(dftt.index)
    dftt = dftt.drop('price', axis=1)
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]  # noqa: F841 - kept for parity with callers' expectations

    # Prepend the brand to the name where a brand is present and its
    # lower-cased form does not occur among the lower-cased names.
    # np.where works positionally, which avoids both chained assignment and
    # label alignment on `merge`'s duplicated index labels.
    ix = merge['brand_name'].notnull() & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'] = np.where(
        ix, merge['brand_name'] + ' ' + merge['name'], merge['name'])

    # Extract the development (hold-out) split as positions into the kept
    # training rows.
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(split_cat))
    # `category_name` is intentionally kept: it is hashed as a whole below.
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    for c, (k, v) in enumerate(bigram_mapper.items()):
        print(c, k, v)
        merge['name'] = merge.name.str.replace(k, v)
        merge['item_description'] = merge.item_description.str.replace(k, v)

    # --- Crossed columns ---------------------------------------------------
    # My understanding of how to replicate what layers.crossed_column does.
    # One can read about it here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """Map each '<col>_<col>' name to the list of columns it crosses."""
        return {'_'.join(x_c): x_c for x_c in x_cols}

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        # Each member column gets its own additive offset before summing.
        indicator = 0
        for col in v:
            # NOTE(review): built-in hash() of str is salted per process
            # unless PYTHONHASHSEED is fixed, so the crossed ids are only
            # stable within one run.
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    # One-hot encode every crossed column and stack the results side by side.
    lb = LabelBinarizer(sparse_output=True)
    x_col = hstack([lb.fit_transform(merge[name])
                    for name in crossed_columns_d])
    del lb

    # (An experimental "encode original strings" feature block, X_orig, was
    # disabled in the original by wrapping it in a string literal; it is
    # omitted here.)
    gc.collect()
    cpuStats()

    # --- Hash name -----------------------------------------------------------
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.5, 1.0],
        "hash_size": 2**29,
        "norm": None,
        "tf": 'binary',
        "idf": None,
    }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    # Drop hashed columns that are non-zero in at most one row.
    X_name = X_name[:, X_name.getnnz(axis=0) > 1]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    # --- Hash category -------------------------------------------------------
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2**20,
        "norm": None,
        "tf": 'binary',
        "idf": None,
    }), procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, X_cat.getnnz(axis=0) > 1]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    # --- Count category ------------------------------------------------------
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # --- Hash description ----------------------------------------------------
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2**28,
        "norm": "l2",
        "tf": 1.0,
        "idf": None
    }), procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, X_description.getnnz(axis=0) > 1]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1.
    print(sparse_merge.shape)
    sparse_merge = sparse_merge[:, sparse_merge.getnnz(axis=0) > 1]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        train_X, valid_X = X[trnidx], X[validx]
        train_y, valid_y = y.values[trnidx], y.values[validx]
    else:
        # Previously train_X/train_y were left undefined in this mode and
        # the fit below raised NameError; train on every row instead.
        train_X, train_y = X, y.values

    # iters=1 so that each fit() call below performs a single pass.
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)

    baseline = 1.
    predsfm = None
    for _ in range(15):
        # NOTE(review): as in the original loop, this assumes repeated
        # fit() calls continue training from the current weights.
        model.fit(train_X, train_y, verbose=1)
        if not develop:
            continue
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            # Hold-out error stopped improving: stop early.
            break

    # The original message said "ridge v2", a copy-paste leftover.
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        # The model is unchanged since the last pass, so the hold-out
        # predictions from the loop are still current.
        print("FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(predsfm)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
def main():
    """Build (or load cached) vectorized features and run the hyperopt search.

    If the cache file ``Data/feature_vectorized2`` exists, the sparse feature
    matrix and the target are loaded from it with ``_load``. Otherwise the
    features are built from ``get_extract_feature()`` (first ``TRAIN_SIZE``
    rows only), vectorized, stacked into one CSR matrix and stored with
    ``_save`` for the next run.

    The matrix and target are then handed to ``TaskOptimizer`` for the
    learner named in ``learner_name``.
    """
    feature_vectorized_file_name = 'Data/feature_vectorized2'

    # The original test was inverted (`== False`), so it tried to load the
    # cache exactly when the file was missing and rebuilt it when present.
    # NOTE(review): if `_save` writes to a different path than the one given
    # (e.g. appends an extension), test that path here instead.
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        start_time = time.time()

        merge, _submission, price = get_extract_feature()
        # Copy the slice so the column assignments below modify an
        # independent frame rather than a view of the full one.
        merge = merge[:TRAIN_SIZE].copy()

        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(time.time() -
                                                          start_time))

        # Vectorize the categorical / brand columns.
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print('[{}] Count vectorize `categories` completed.'.format(
            time.time() - start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print('[{}] Label binarize `brand_name` completed.'.format(
            time.time() - start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # Hand-crafted features: log-scale every length / frequency column
        # and rescale it by its maximum.
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                peak = merge[col].max()
                # Skip the division for a zero maximum; 0/0 would fill the
                # column with NaN.
                if peak > 0:
                    merge[col] = merge[col] / peak

        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct'
        ]
        X_hand_feature = merge[hand_feature].values

        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
            "hash_ngrams": 2,
            "hash_ngrams_weights": [name_w1, name_w2],
            "hash_size": 2**28,
            "norm": None,
            "tf": 'binary',
            "idf": None,
        }), procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del wb
        # Keep hashed columns that are non-zero in more than 2 rows.
        X_name = X_name[:, X_name.getnnz(axis=0) > 2]
        print('[{}] Vectorize `name` completed.'.format(time.time() -
                                                        start_time))

        # The description is vectorized together with the second-level
        # category and the name, joined by a fixed separator token.
        merge['item_description'] = merge['category_2'].map(str)+' .#d3 .#d3 '+\
                                    merge['name'].map(str)+' .#d3 .#d3 '+\
                                    merge['item_description'].map(str)

        wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
            "hash_ngrams": 3,
            "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
            "hash_size": 2**28,
            "norm": "l2",
            "tf": 1.0,
            "idf": None
        }), procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del wb
        # Keep hashed columns that are non-zero in more than 6 rows.
        X_description = X_description[:, X_description.getnnz(axis=0) > 6]
        print('[{}] Vectorize `item_description` completed.'.format(
            time.time() - start_time))

        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name,
                               X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    # Use hyperopt to find the best parameters of the model.
    # Alternatives used previously: 'best_FTRL', 'FTRL', 'FM_FTRL'.
    learner_name = 'best_FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                 time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()
def get_pred_ftrl(submission):
    """Fit FTRL and FM_FTRL and store their test predictions on `submission`.

    Loads the Mercari train/test TSVs, drops training rows whose price is 0,
    vectorizes the stacked frame into one sparse matrix and fits both models
    on ``log1p(price)``. The raw model outputs for the test rows are written
    to ``submission['price_FTRL']`` and ``submission['price_FM_FTRL']``
    (no ``expm1`` is applied here).

    When the module-level ``develop`` flag is truthy, 5% of the training
    rows are held out and each model's RMSLE on them is printed.

    Args:
        submission: object supporting column assignment by key; it is
            modified in place. Nothing is returned.
    """
    from time import gmtime, strftime

    t0 = time.time()

    def stamp(message):
        # Print `message` with its `{}` slot filled by the elapsed seconds.
        print(message.format(time.time() - t0))

    def prune_singletons(matrix):
        # Keep the columns that are non-zero in at least two rows.
        return matrix[:, matrix.getnnz(axis=0) >= 2]

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    train_df = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test_df = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    stamp('[{}] Finished to load data')
    print('Train shape: ', train_df.shape)
    print('Test shape: ', test_df.shape)

    # Zero-priced rows are removed outright, so the test rows start right
    # after the kept training rows in the stacked frame.
    train_df = train_df[train_df["price"] != 0]
    nrow_train = train_df.shape[0]
    y = np.log1p(train_df["price"])
    merge = pd.concat([train_df, test_df])

    del train_df
    del test_df
    gc.collect()

    category_parts = merge['category_name'].apply(split_cat)
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = zip(
        *category_parts)
    merge.drop('category_name', axis=1, inplace=True)
    stamp('[{}] Split categories completed.')

    handle_missing_inplace(merge)
    stamp('[{}] Handle missing completed.')

    cutting(merge)
    stamp('[{}] Cut completed.')

    to_categorical(merge)
    stamp('[{}] Convert categorical completed')

    name_bag = {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.5, 1.0],
        "hash_size": 2**29,
        "norm": None,
        "tf": 'binary',
        "idf": None,
    }
    name_hasher = wordbatch.WordBatch(normalize_text,
                                      extractor=(WordBag, name_bag), procs=8)
    name_hasher.dictionary_freeze = True
    X_name = name_hasher.fit_transform(merge['name'])
    del name_hasher
    X_name = prune_singletons(X_name)
    stamp('[{}] Vectorize `name` completed.')

    category_counter = CountVectorizer()
    X_category1 = category_counter.fit_transform(merge['general_cat'])
    X_category2 = category_counter.fit_transform(merge['subcat_1'])
    X_category3 = category_counter.fit_transform(merge['subcat_2'])
    stamp('[{}] Count vectorize `categories` completed.')

    description_bag = {
        "hash_ngrams": 2,
        "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2**28,
        "norm": "l2",
        "tf": 1.0,
        "idf": None
    }
    description_hasher = wordbatch.WordBatch(
        normalize_text, extractor=(WordBag, description_bag), procs=8)
    description_hasher.dictionary_freeze = True
    X_description = description_hasher.fit_transform(merge['item_description'])
    del description_hasher
    X_description = prune_singletons(X_description)
    stamp('[{}] Vectorize `item_description` completed.')

    brand_encoder = LabelBinarizer(sparse_output=True)
    X_brand = brand_encoder.fit_transform(merge['brand_name'])
    stamp('[{}] Label binarize `brand_name` completed.')

    dummies = pd.get_dummies(merge[['item_condition_id', 'shipping']],
                             sparse=True)
    X_dummies = csr_matrix(dummies.values)
    stamp('[{}] Get dummies on `item_condition_id` and `shipping` completed.')

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    stamp('[{}] Create sparse merge completed')

    # Drop columns of the stacked matrix that are non-zero in at most one row.
    print(sparse_merge.shape)
    sparse_merge = prune_singletons(sparse_merge)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=0.05, random_state=100)
    else:
        train_X, train_y = X, y

    n_features = sparse_merge.shape[1]

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=n_features,
                 iters=50, inv_link="identity", threads=1)
    model.fit(train_X, train_y)
    stamp('[{}] Train FTRL completed')
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    submission['price_FTRL'] = predsF
    stamp('[{}] Predict FTRL completed')

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=n_features,
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200,
                    e_noise=0.0001, iters=17, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    stamp('[{}] Train ridge v2 completed')
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    stamp('[{}] Predict FM_FTRL completed')
    submission['price_FM_FTRL'] = predsFM
del(wb) X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0, "idf": None}) , procs=8) wb.dictionary_freeze= True # p = Pool(processes=8) # merge['item_description'] = p.map(transform, merge.item_description.values) # p.terminate() X_description = wb.fit_transform(merge['item_description']) del(wb) gc.collect() X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
def _drop_rare_columns(matrix, max_doc_freq):
    """Return *matrix* keeping only columns with more than *max_doc_freq* non-zero rows."""
    # Same mask as np.array(np.clip(nnz - k, 0, 1), dtype=bool): true iff nnz > k.
    return matrix[:, matrix.getnnz(axis=0) > max_doc_freq]


def main():
    """Train Ridge, FTRL, FM_FTRL and LightGBM models and write a blended submission.

    Reads ``../input/train.tsv`` and ``../input/test.tsv``, builds one sparse
    feature matrix over all rows, fits the four models on ``log1p(price)``,
    averages their predictions with weights 1 / 1 / 16 / 6 and writes
    ``expm1`` of the blend to ``submission.csv``. Progress messages are
    printed along the way.

    Relies on helpers and imports defined elsewhere in the file
    (``split_cat``, ``handle_missing_inplace``, ``cutting``,
    ``to_categorical``, ``normalize_text``, ``NAME_MIN_DF``, ``wordbatch``,
    ``WordBag``, ``FTRL``, ``FM_FTRL``, ``Ridge``, ``lgb`` ...).
    """
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('Finished to load data')

    # Row count of the raw training file. `merge` below is ordered
    # [kept train rows | low-price train rows | test rows], so this is the
    # position of the first test row in every stacked matrix.
    test_start = train.shape[0]

    # Rows priced below 1.0 are not used as training targets, but they stay in
    # the merged frame (without `price`) so the vectorizers are fitted on them.
    low_price = train.price < 1.0
    dftt = train[low_price].drop('price', axis=1)
    train = train[~low_price]
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])

    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # Explicit copy: a `price` column is assigned to this frame at the end.
    submission: pd.DataFrame = test[['test_id']].copy()
    del train, test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('Split categories completed.')

    handle_missing_inplace(merge)
    print('Handle missing completed.')

    cutting(merge)
    print('Cut completed.')

    to_categorical(merge)
    print('Convert categorical completed')

    # Plain bag-of-words counts for the name and the three category levels.
    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_cv = cv.fit_transform(merge['name'])
    cv = CountVectorizer()
    X_category1_cv = cv.fit_transform(merge['general_cat'])
    X_category2_cv = cv.fit_transform(merge['subcat_1'])
    X_category3_cv = cv.fit_transform(merge['subcat_2'])

    # Hashed uni-/bi-gram features of the item name.
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        'hash_ngrams': 2,
        'hash_ngrams_weights': [1.5, 1.0],
        'hash_size': 2**29,
        'norm': None,
        'tf': 'binary',
        'idf': None,
    }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = _drop_rare_columns(X_name, 1)
    print('Vectorize `name` completed.')

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('Count vectorize `categories` completed.')

    # Hashed uni-/bi-gram features of the description, L2-normalised.
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {
        'hash_ngrams': 2,
        'hash_ngrams_weights': [1.0, 1.0],
        'hash_size': 2**28,
        'norm': 'l2',
        'tf': 1.0,
        'idf': None
    }), procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = _drop_rare_columns(X_description, 1)
    print('Vectorize `item_description` completed.')

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('Label binarize `brand_name` completed.')

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('Get dummies on `item_condition_id` and `shipping` completed.')

    # Three dense description statistics, each scaled by its column maximum.
    descriptions = merge['item_description']
    upper_runs = re.compile('[A-Z]+')
    num_chars = descriptions.apply(len).values
    num_words = descriptions.apply(lambda x: len(x.split(' '))).values
    num_upper = descriptions.apply(
        lambda x: len(upper_runs.findall(x))).values
    # ndarray.max() instead of builtin max(): same value, no Python-level loop.
    num_chars = num_chars / num_chars.max()
    num_words = num_words / num_words.max()
    num_upper = num_upper / num_upper.max()
    X_feature = np.vstack([num_chars, num_words, num_upper]).T
    print('musicmilif features completed.')

    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_category1_cv, X_category2_cv, X_category3_cv,
         X_name_cv, X_feature)).tocsr()
    print('Create sparse merge completed')

    del X_dummies, X_description, X_brand, X_category1, X_category2, X_category3
    del X_name, X_category1_cv, X_category2_cv, X_category3_cv, X_name_cv, X_feature
    del num_chars, num_words, num_upper
    gc.collect()

    # Remove features with document frequency <=1
    sparse_merge = _drop_rare_columns(sparse_merge, 1)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[test_start:]
    gc.collect()
    train_X, train_y = X, y

    # `normalize` is not passed: False was its default, and the parameter was
    # removed from Ridge in scikit-learn 1.2.
    model = Ridge(solver='auto', fit_intercept=True, alpha=5.0,
                  max_iter=100, tol=0.05)
    model.fit(train_X, train_y)
    print('Train Ridge completed')
    predsR = model.predict(X_test)
    print('Predict Ridge completed')

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0,
                 D=sparse_merge.shape[1], iters=50, inv_link="identity",
                 threads=1)
    model.fit(train_X, train_y)
    print('Train FTRL completed')
    predsF = model.predict(X_test)
    print('Predict FTRL completed')

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=17,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('Train FM_FTRL completed')
    predsFM = model.predict(X_test)
    print('Predict FM_FTRL completed')

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 9,
        'num_leaves': 24,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'feature_fraction': 0.8,
        'nthread': 4,
        'min_data_in_leaf': 51,
        'max_bin': 64
    }

    # Remove features with document frequency <=200
    # (applied on top of the <=1 filter; only the LightGBM model sees this
    # narrower matrix).
    sparse_merge = _drop_rare_columns(sparse_merge, 200)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[test_start:]
    train_X, train_y = X, y

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    # NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
    # arguments of older `lgb.train` releases (removed in LightGBM 4.0) —
    # confirm the installed version before upgrading.
    model = lgb.train(params, train_set=d_train, num_boost_round=1800,
                      valid_sets=watchlist, early_stopping_rounds=500,
                      verbose_eval=400)
    predsL = model.predict(X_test)
    print('Predict LGBM completed')

    # Weighted average of the four models, still in log1p(price) space.
    preds = (predsR * 1 + predsF * 1 + predsFM * 16 + predsL * 6) / (1 + 1 + 16 + 6)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission.csv", index=False)