def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['counters.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['counters.dataset'])

    logging.info('Computing question frequencies')
    compute_counters(train_df, test_df)

    logging.info('Writing dump')
    dump_dir = conf['counters.dump.dir']
    try:
        makedirs(dump_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        FieldsTrain.freq_q1,
        FieldsTrain.freq_q2,
        FieldsTrain.intersect_q1_q2,
        FieldsTrain.intersect2_q1_q2
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[
        FieldsTest.test_id,
        FieldsTest.freq_q1,
        FieldsTest.freq_q2,
        FieldsTest.intersect_q1_q2,
        FieldsTest.intersect2_q1_q2
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
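# compute_counters is defined elsewhere in the repository. A minimal sketch of
# how the freq_q1/freq_q2 columns written above could be produced (raw question
# occurrence counts over train and test combined); the helper name and the exact
# definitions here are assumptions, not the repository's implementation, and the
# intersect features are omitted.
def compute_counters_sketch(train_df, test_df):
    from collections import Counter
    counts = Counter()
    for df in (train_df, test_df):
        counts.update(df['question1'].fillna('').str.lower())
        counts.update(df['question2'].fillna('').str.lower())
    for df in (train_df, test_df):
        df['freq_q1'] = df['question1'].fillna('').str.lower().map(counts)
        df['freq_q2'] = df['question2'].fillna('').str.lower().map(counts)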
def main(conf):
    dump_dir = conf['simplest.dump.dir']
    try:
        makedirs(dump_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    logging.info('Loading train dataset')
    train_df = load_train_df()

    logging.info('Loading test dataset')
    test_df = load_test_df()

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.len_q1,
        Fields.len_q2,
        Fields.diff_len,
        Fields.len_word_q1,
        Fields.len_word_q2,
        Fields.diff_len_word,
        Fields.len_char_q1,
        Fields.len_char_q2,
        Fields.diff_len_char
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.len_q1,
        Fields.len_q2,
        Fields.diff_len,
        Fields.len_word_q1,
        Fields.len_word_q2,
        Fields.diff_len_word,
        Fields.len_char_q1,
        Fields.len_char_q2,
        Fields.diff_len_char
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
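# compute_features for this script is not shown. A minimal sketch of the
# length-based columns written above, assuming len_* is string length,
# len_word_* a word count and len_char_* a non-space character count; the exact
# definitions and the helper name are assumptions.
def compute_length_features_sketch(df):
    for i in ('1', '2'):
        q = df['question' + i].fillna('')
        df['len_q' + i] = q.str.len()
        df['len_word_q' + i] = q.str.split().str.len()
        df['len_char_q' + i] = q.str.replace(' ', '').str.len()
    df['diff_len'] = (df['len_q1'] - df['len_q2']).abs()
    df['diff_len_word'] = (df['len_word_q1'] - df['len_word_q2']).abs()
    df['diff_len_char'] = (df['len_char_q1'] - df['len_char_q2']).abs()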
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['kcores.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['kcores.dataset'])

    logging.info('Loading kcores dump')
    kcores = load_kcores(conf['kcores.source'])

    def substitute_kcores(q):
        return kcores.get(q.lower(), 0)

    train_df['q1_kcores'] = train_df.apply(lambda r: substitute_kcores(r['question1']), axis=1)
    train_df['q2_kcores'] = train_df.apply(lambda r: substitute_kcores(r['question2']), axis=1)
    test_df['q1_kcores'] = test_df.apply(lambda r: substitute_kcores(r['question1']), axis=1)
    test_df['q2_kcores'] = test_df.apply(lambda r: substitute_kcores(r['question2']), axis=1)

    logging.info('Writing dump')
    dump_dir = conf['kcores.dump.dir']
    makedirs(dump_dir)

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        FieldsTrain.q1_kcores,
        FieldsTrain.q2_kcores
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[
        FieldsTest.test_id,
        FieldsTest.q1_kcores,
        FieldsTest.q2_kcores
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
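# load_kcores reads a precomputed question -> k-core index mapping from
# conf['kcores.source']. A sketch of how such a mapping could be produced with
# networkx from the question co-occurrence graph; this is an assumption about
# the source of the dump, not the repository's actual pipeline, and the helper
# name is illustrative.
def build_kcores_sketch(train_df, test_df):
    import networkx as nx
    g = nx.Graph()
    for df in (train_df, test_df):
        g.add_edges_from(zip(df['question1'].fillna('').str.lower(),
                             df['question2'].fillna('').str.lower()))
    # core_number requires a graph without self-loops
    g.remove_edges_from(list(nx.selfloop_edges(g)))
    return nx.core_number(g)  # dict: question text -> core number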
def main(conf):
    dump_dir = conf['distances.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['distances.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['distances.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.jaccard,
        Fields.levenstein1,
        Fields.levenstein2,
        Fields.sorensen
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.jaccard,
        Fields.levenstein1,
        Fields.levenstein2,
        Fields.sorensen
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
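# compute_features here is not shown. A plausible sketch of the four distance
# columns using the `distance` package (Jaccard and Sorensen on token sets,
# normalized Levenshtein with its two normalization methods). Whether the
# repository computes them exactly this way is an assumption.
def compute_distance_features_sketch(df):
    import distance
    q1 = df['question1'].fillna('').str.lower()
    q2 = df['question2'].fillna('').str.lower()
    pairs = list(zip(q1, q2))
    df['jaccard'] = [distance.jaccard(a.split(), b.split()) for a, b in pairs]
    df['sorensen'] = [distance.sorensen(a.split(), b.split()) for a, b in pairs]
    df['levenstein1'] = [distance.nlevenshtein(a, b, method=1) for a, b in pairs]
    df['levenstein2'] = [distance.nlevenshtein(a, b, method=2) for a, b in pairs]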
def main(conf):
    dump_dir = conf['cleaning']['dump']['dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df()

    logging.info('Cleaning train dataset')
    train_df[Fields.question1] = train_df[Fields.question1].apply(lambda q: clean(q, **conf['cleaning']))
    train_df[Fields.question2] = train_df[Fields.question2].apply(lambda q: clean(q, **conf['cleaning']))

    logging.info('Writing train dataset')
    train_df.to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df()

    logging.info('Cleaning test dataset')
    test_df[Fields.question1] = test_df[Fields.question1].apply(lambda q: clean(q, **conf['cleaning']))
    test_df[Fields.question2] = test_df[Fields.question2].apply(lambda q: clean(q, **conf['cleaning']))

    logging.info('Writing test dataset')
    test_df.to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['baseline.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['baseline.dataset'])

    logging.info('Computing baseline features')
    compute_features(train_df, test_df)

    logging.info('Writing dump')
    dump_dir = conf['baseline.dump.dir']
    makedirs(dump_dir)

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        FieldsTrain.word_match,
        FieldsTrain.jaccard,
        FieldsTrain.wc_diff,
        FieldsTrain.wc_ratio,
        FieldsTrain.wc_diff_unique,
        FieldsTrain.wc_ratio_unique,
        FieldsTrain.wc_diff_unq_stop,
        FieldsTrain.wc_ratio_unique_stop,
        FieldsTrain.same_start,
        FieldsTrain.char_diff,
        FieldsTrain.char_diff_unq_stop,
        FieldsTrain.total_unique_words,
        FieldsTrain.total_unq_words_stop,
        FieldsTrain.char_ratio,
        FieldsTrain.tfidf_wm,
        FieldsTrain.tfidf_wm_stops
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[
        FieldsTest.test_id,
        FieldsTest.word_match,
        FieldsTest.jaccard,
        FieldsTest.wc_diff,
        FieldsTest.wc_ratio,
        FieldsTest.wc_diff_unique,
        FieldsTest.wc_ratio_unique,
        FieldsTest.wc_diff_unq_stop,
        FieldsTest.wc_ratio_unique_stop,
        FieldsTest.same_start,
        FieldsTest.char_diff,
        FieldsTest.char_diff_unq_stop,
        FieldsTest.total_unique_words,
        FieldsTest.total_unq_words_stop,
        FieldsTest.char_ratio,
        FieldsTest.tfidf_wm,
        FieldsTest.tfidf_wm_stops
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
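# The baseline column set above appears to follow the widely shared Quora
# starter features (word-match share, tf-idf weighted word match, word/char
# count diffs and ratios). A minimal sketch of the word_match feature only,
# under that assumption; the stopword handling and remaining columns are not
# reproduced here.
def word_match_share_sketch(q1, q2, stops=frozenset()):
    w1 = set(w for w in str(q1).lower().split() if w not in stops)
    w2 = set(w for w in str(q2).lower().split() if w not in stops)
    if not w1 or not w2:
        return 0.0
    shared = len(w1 & w2)
    return 2.0 * shared / (len(w1) + len(w2))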
def main(conf):
    dump_dir = conf['exploration.dump.dir']
    makedirs(dump_dir)

    notebook_file = join_path(dump_dir, conf['exploration.dump.notebook'])
    notebook_cells = []

    images_dir = join_path(dump_dir, conf['exploration.dump.images.dir'])
    makedirs(images_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df()
    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()

    logging.info('Loading test dataset')
    test_df = load_test_df()

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].iteritems():
        logging.info('Loading features group: %s', group)
        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')
        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)
        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    figure = plt.figure(1, figsize=[8, 6])
    for feature in features:
        logging.info('Feature: %s', feature)
        train_stats = train_df[[feature]].describe()
        test_stats = test_df[[feature]].describe()

        cell = new_markdown_cell("# %s" % feature)
        notebook_cells.append(cell)

        sns.distplot(train_df[[feature]])
        sns.distplot(test_df[[feature]])
        image_file = join_path(images_dir, 'hist_%s.png' % feature)
        figure.savefig(image_file)
        plt.cla()

    nb = new_notebook(cells=notebook_cells)
    with open(notebook_file, 'w') as fh:
        nb_write(nb, fh)
def main(conf):
    dump_dir = conf['mephistopheies.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['mephistopheies.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['mephistopheies.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        FieldsTrain.unigram_all_jaccard,
        FieldsTrain.unigram_all_jaccard_max,
        FieldsTrain.bigram_all_jaccard,
        FieldsTrain.bigram_all_jaccard_max,
        FieldsTrain.trigram_all_jaccard,
        FieldsTrain.trigram_all_jaccard_max,
        FieldsTrain.trigram_tfidf_cosine,
        FieldsTrain.trigram_tfidf_l2_euclidean,
        FieldsTrain.m_q1_q2_tf_svd0,
        FieldsTrain.m_q1_q2_tf_svd1,
        FieldsTrain.m_q1_q2_tf_svd2,
        # FieldsTrain.m_w1l_tfidf_oof
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        FieldsTest.unigram_all_jaccard,
        FieldsTest.unigram_all_jaccard_max,
        FieldsTest.bigram_all_jaccard,
        FieldsTest.bigram_all_jaccard_max,
        FieldsTest.trigram_all_jaccard,
        FieldsTest.trigram_all_jaccard_max,
        FieldsTest.trigram_tfidf_cosine,
        FieldsTest.trigram_tfidf_l2_euclidean,
        FieldsTest.m_q1_q2_tf_svd0,
        FieldsTest.m_q1_q2_tf_svd1,
        FieldsTest.m_q1_q2_tf_svd2,
        # FieldsTest.m_w1l_tfidf_oof
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['tfidf.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['tfidf.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['tfidf.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.m_q1_q2_tf_svd0
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.m_q1_q2_tf_svd0
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
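# compute_features for the fuzzy script is not shown. The column names match the
# fuzzywuzzy scorers one-to-one, so a sketch along these lines seems likely
# (scores are 0-100 similarities); any preprocessing applied before scoring is
# an assumption.
def compute_fuzzy_features_sketch(df):
    from fuzzywuzzy import fuzz
    pairs = list(zip(df['question1'].fillna(''), df['question2'].fillna('')))
    df['qratio'] = [fuzz.QRatio(a, b) for a, b in pairs]
    df['wratio'] = [fuzz.WRatio(a, b) for a, b in pairs]
    df['partial_ratio'] = [fuzz.partial_ratio(a, b) for a, b in pairs]
    df['partial_token_set_ratio'] = [fuzz.partial_token_set_ratio(a, b) for a, b in pairs]
    df['partial_token_sort_ratio'] = [fuzz.partial_token_sort_ratio(a, b) for a, b in pairs]
    df['token_set_ratio'] = [fuzz.token_set_ratio(a, b) for a, b in pairs]
    df['token_sort_ratio'] = [fuzz.token_sort_ratio(a, b) for a, b in pairs]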
def main(conf):
    dump_dir = conf['svdff.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svdff.dataset'])
    y = train_df['is_duplicate'].values

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svdff.vectorizer'])
        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)
    if X is None:
        logging.info('Unable to load cached train feature matrix')
        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')
        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svdff.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svdff.svd'])
        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)

    plot_singular_values(S, dump_dir)

    logging.info('Computing train SVD features')
    Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
    U = X.dot(VT.transpose().dot(Sinv))
    logging.info('Train feature matrix dimensions: %s', U.shape)

    # The feature matrix stacks q1 rows on top of q2 rows; split and recombine
    # into symmetric (mean) and antisymmetric (half-difference) parts.
    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0])
    logging.info('Training feature matrix: %s', U.shape)

    logging.info('Training feed-forward neural networks')
    quality, predictions = train_ff(U, y, skfold(), dump_dir=dump_dir, **conf['svdff.ff'])

    logging.info('Plotting quality metrics')
    quality_dir = join_path(dump_dir, 'quality')
    makedirs(quality_dir)
    for q in quality['folds']:
        img_dir = join_path(quality_dir, 'fold%d' % q['fold'])
        makedirs(img_dir)
        plot_quality(q, img_dir)

    logging.info('Writing train features')
    train_df['svdff'] = predictions
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        'svdff'
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svdff.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD features')
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0])

    logging.info('Applying models to test dataset')
    test_df['svdff'] = np.zeros(U.shape[0])
    for q in quality['folds']:
        f = load_model(q['dump'])
        p = f.predict_proba(U).flatten()
        test_df['svdff'] = test_df['svdff'] + logit(p)
    test_df['svdff'] = test_df['svdff'] / len(quality['folds'])

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svdff',
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['glove.dump.dir']
    makedirs(dump_dir)

    logging.warning('Loading train dataset')
    train_df = load_train_df(conf['glove.dataset'])

    logging.warning('Loading test dataset')
    test_df = load_test_df(conf['glove.dataset'])

    logging.warning('Loading embeddings')
    embeddings_dir = conf['glove.embeddings.dir']
    embeddings_file = join_path(embeddings_dir, conf['glove.embeddings.file'])
    glove = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
    glove.init_sims(replace=True)
    processor = Glove(glove)

    logging.warning('Computing train features')
    train_df[Fields.glove_wmd], \
    train_df[Fields.glove_cos], \
    train_df[Fields.glove_city], \
    train_df[Fields.glove_jacc], \
    train_df[Fields.glove_canb], \
    train_df[Fields.glove_eucl], \
    train_df[Fields.glove_mink], \
    train_df[Fields.glove_bray], \
    train_df[Fields.glove_skew_q1], \
    train_df[Fields.glove_skew_q2], \
    train_df[Fields.glove_kurt_q1], \
    train_df[Fields.glove_kurt_q2] = \
        zip(*train_df.progress_apply(lambda r: processor.features(r['question1'], r['question2']), axis=1))

    for feature in [f for f in dir(Fields()) if f.startswith('glove')]:
        logging.warning('Feature %s AUC=%s', feature,
                        roc_auc_score(train_df[FieldsTrain.is_duplicate], train_df[feature]))

    logging.warning('Writing train feature dump')
    train_df.drop([
        Fields.question1,
        Fields.question2,
        FieldsTrain.qid1,
        FieldsTrain.qid2
    ], axis=1, inplace=True)
    train_df.to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.warning('Computing test features')
    test_df[Fields.glove_wmd], \
    test_df[Fields.glove_cos], \
    test_df[Fields.glove_city], \
    test_df[Fields.glove_jacc], \
    test_df[Fields.glove_canb], \
    test_df[Fields.glove_eucl], \
    test_df[Fields.glove_mink], \
    test_df[Fields.glove_bray], \
    test_df[Fields.glove_skew_q1], \
    test_df[Fields.glove_skew_q2], \
    test_df[Fields.glove_kurt_q1], \
    test_df[Fields.glove_kurt_q2] = \
        zip(*test_df.progress_apply(lambda r: processor.features(r['question1'], r['question2']), axis=1))

    logging.warning('Writing test feature dump')
    test_df.drop([Fields.question1, Fields.question2], axis=1, inplace=True)
    test_df.to_csv(join_path(dump_dir, 'test.csv'), index=False)
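# The Glove processor class is defined elsewhere. A sketch of what its per-pair
# feature tuple plausibly contains, matching the unpacking order above: word
# mover's distance from gensim plus standard scipy distances between averaged
# word vectors, and skew/kurtosis of each averaged vector. The averaging scheme
# and the class below are assumptions, not the repository's implementation.
class GloveSketch(object):
    def __init__(self, kv):
        self.kv = kv  # gensim KeyedVectors, unit-normalized via init_sims(replace=True)

    def _mean_vector(self, text):
        import numpy as np
        words = [w for w in str(text).lower().split() if w in self.kv]
        if not words:
            return np.zeros(self.kv.vector_size)
        return np.mean([self.kv[w] for w in words], axis=0)

    def features(self, q1, q2):
        from scipy.spatial import distance as sp_dist
        from scipy.stats import skew, kurtosis
        v1, v2 = self._mean_vector(q1), self._mean_vector(q2)
        wmd = self.kv.wmdistance(str(q1).lower().split(), str(q2).lower().split())
        return (wmd,
                sp_dist.cosine(v1, v2), sp_dist.cityblock(v1, v2),
                sp_dist.jaccard(v1, v2), sp_dist.canberra(v1, v2),
                sp_dist.euclidean(v1, v2), sp_dist.minkowski(v1, v2, 3),
                sp_dist.braycurtis(v1, v2),
                skew(v1), skew(v2), kurtosis(v1), kurtosis(v2))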
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['dataset_raw'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['dataset_raw'])

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}

    for w, cnf in conf['linear'].iteritems():
        if not cnf.get_bool('enabled', True):
            continue
        if w == 'dataset':
            continue

        logging.info('Start training linear model: %s', w)

        dump_dir = cnf.get('dump.dir') or '.'
        makedirs(dump_dir)

        config_file = join_path(dump_dir, 'application.conf')
        dump_config(conf, config_file)

        vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
        quality_file = join_path(dump_dir, 'quality.json')

        y = train_df[FieldsTrain.is_duplicate]

        if cnf['dump.cache.enabled']:
            logging.info('Loading vectorizer')
            try:
                vectorizer = joblib.load(vectorizer_file)
            except:
                logging.info('Unable to load vectorizer')
                vectorizer = None

            if vectorizer is None:
                logging.info('Training vectorizer')
                vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])
                nf = len(vectorizer.vocabulary_)
                logging.info('Feature count: %d', nf)
                logging.info('Dumping vectorizer')
                joblib.dump(vectorizer, vectorizer_file)

            features_cache_file = join_path(dump_dir, cnf['dump.cache.train'])
            logging.info('Loading cached train feature matrix from %s', features_cache_file)
            X = load_feature_matrix(features_cache_file)
            if X is None:
                logging.info('Unable to load cached train feature matrix')
                logging.info('Computing train feature matrix')
                X = compute_feature_matrix(train_df, vectorizer, combine=cnf['combine'])
                logging.info('Writing train feature matrix to %s', features_cache_file)
                save_feature_matrix(X, features_cache_file)
        else:
            logging.info('Training vectorizer')
            vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])
            X = compute_feature_matrix(train_df, vectorizer, combine=cnf['combine'])
            nf = len(vectorizer.vocabulary_)
            logging.info('Feature count: %d', nf)

        logging.info('Training feature matrix: %s', X.shape)

        quality, predictions = train(X, y, skfold(), class_weight, dump_dir=dump_dir, **cnf['model'])

        with open(quality_file, 'w') as qfh:
            json.dump(quality, qfh)

        logging.info('Writing train set to disk')
        train_df[FieldsTrain.linear] = predictions
        train_df[[
            FieldsTrain.id,
            FieldsTrain.is_duplicate,
            FieldsTrain.linear
        ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

        if cnf['dump.cache.enabled']:
            features_cache_file = join_path(dump_dir, cnf['dump.cache.test'])
            logging.info('Loading cached test feature matrix from %s', features_cache_file)
            X = load_feature_matrix(features_cache_file)
            if X is None:
                logging.info('Unable to load cached test feature matrix')
                logging.info('Computing test feature matrix')
                X = compute_feature_matrix(test_df, vectorizer, combine=cnf['combine'])
                logging.info('Writing test feature matrix to cache')
                save_feature_matrix(X, features_cache_file)
        else:
            logging.info('Computing test feature matrix')
            X = compute_feature_matrix(test_df, vectorizer, combine=cnf['combine'])

        logging.info('Computing test predictions as average logit of cross-validation models')
        test_df[FieldsTest.linear_cv] = np.zeros(X.shape[0])
        for fold in quality['folds']:
            f = joblib.load(fold['dump'])
            p = logit(f.predict_proba(X)[:, 1])
            test_df[FieldsTest.linear_cv] = test_df[FieldsTest.linear_cv] + p
        test_df[FieldsTest.linear_cv] = test_df[FieldsTest.linear_cv] / len(quality['folds'])

        logging.info('Computing test predictions with full model')
        f = joblib.load(quality['full']['unweighted']['dump'])
        p = logit(f.predict_proba(X)[:, 1])
        test_df[FieldsTest.linear_full] = p

        logging.info('Computing test predictions with full weighted model')
        f = joblib.load(quality['full']['weighted']['dump'])
        p = logit(f.predict_proba(X)[:, 1])
        test_df[FieldsTest.linear_full_weighted] = p

        logging.info('Writing test set to disk')
        test_df[[
            FieldsTest.test_id,
            FieldsTest.linear_cv,
            FieldsTest.linear_full,
            FieldsTest.linear_full_weighted
        ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
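# The fold predictions above are combined in log-odds space rather than in
# probability space. A minimal standalone illustration of the same pattern;
# the function name and the `models` argument are illustrative only.
def average_fold_logits_sketch(models, X):
    import numpy as np
    from scipy.special import logit
    acc = np.zeros(X.shape[0])
    for m in models:
        acc += logit(m.predict_proba(X)[:, 1])
    acc /= len(models)
    # scipy.special.expit(acc) would map the averaged log-odds back to a probability
    return acc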
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svd.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svd.dataset'])

    for f, cnf in conf['svd'].iteritems():
        if f == 'dataset':
            continue
        if not cnf.get('enabled', True):
            continue

        logging.info('Start training SVD model %s', f)

        dump_dir = cnf['dump.dir']
        makedirs(dump_dir)
        logging.info('Dump %s', dump_dir)

        vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
        try:
            logging.info('Loading vectorizer dump')
            vectorizer = joblib.load(vectorizer_file)
        except:
            logging.info('Loading vectorizer dump failed')
            logging.info('Training vectorizer: %s', cnf['vectorizer'])
            vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])
            logging.info('Writing vectorizer dump')
            joblib.dump(vectorizer, vectorizer_file)

        train_features_matrix_file = join_path(dump_dir, 'train_features.npz')
        logging.info('Loading train features matrix')
        X = load_feature_matrix(train_features_matrix_file)
        if X is None:
            logging.info('Loading train feature matrix failed')
            logging.info('Computing train feature matrix')
            X = compute_feature_matrix(train_df, vectorizer, combine=cnf.get('model.transform', None))
            logging.info('Writing train feature matrix dump')
            save_feature_matrix(X, train_features_matrix_file)

        logging.info('Computing SVD decomposition')
        ksvd = cnf['model'].get_int('k')
        S, VT = compute_svd(X.asfptype(), **cnf['model'])
        Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
        logging.info('Singular values %s', S)

        logging.info('Computing train SVD features')
        U = X.dot(VT.transpose()).dot(Sinv)
        logging.info('Train features variance: %s', np.var(U, axis=0))

        features = map(lambda i: f + '_%d' % i, range(U.shape[1]))
        if cnf.get('model.transform', None) == 'stack':
            features_q1 = map(lambda s: s + '_q1', features)
            features_q2 = map(lambda s: s + '_q2', features)
            features = features_q1 + features_q2
            train_features_df_q1 = pd.DataFrame(U[:train_df.shape[0], :], columns=features_q1)
            train_features_df_q2 = pd.DataFrame(U[train_df.shape[0]:, :], columns=features_q2)
            train_df = pd.concat([train_df, train_features_df_q1, train_features_df_q2], axis=1)
            train_df['svd_dist_eucl'] = train_df.apply(lambda r: compute_svd_distance_eucl(r, f, ksvd), axis=1)
            features.append('svd_dist_eucl')
        else:
            train_features_df = pd.DataFrame(U, columns=features)
            train_df = pd.concat([train_df, train_features_df], axis=1)

        for feature in features:
            logging.info('Feature %s AUC=%s', feature,
                         roc_auc_score(train_df[FieldsTrain.is_duplicate], train_df[feature]))

        logging.info('Writing train features dump')
        train_file = join_path(dump_dir, 'train.csv')
        train_df[[FieldsTrain.id, FieldsTrain.is_duplicate] + features].to_csv(train_file, index=False)

        test_features_matrix_file = join_path(dump_dir, 'test_features.npz')
        logging.info('Loading test features matrix')
        X = load_feature_matrix(test_features_matrix_file)
        if X is None:
            logging.info('Loading test feature matrix failed')
            logging.info('Computing test feature matrix')
            X = compute_feature_matrix(test_df, vectorizer, combine=cnf.get('model.transform', None))
            logging.info('Writing test feature matrix dump')
            save_feature_matrix(X, test_features_matrix_file)

        U = X.dot(VT.transpose()).dot(Sinv)
        logging.info('Test features variance: %s', np.var(U, axis=0))

        logging.info('Computing test SVD features')
        if cnf.get('model.transform', None) == 'stack':
            logging.info('Computing q1 test SVD features')
            test_features_df_q1 = pd.DataFrame(U[:test_df.shape[0], :], columns=features_q1)
            test_df = pd.concat([test_df, test_features_df_q1], axis=1)
            del test_features_df_q1

            logging.info('Computing q2 test SVD features')
            test_features_df_q2 = pd.DataFrame(U[test_df.shape[0]:, :], columns=features_q2)
            test_df = pd.concat([test_df, test_features_df_q2], axis=1)
            del test_features_df_q2

            logging.info('Computing svd distances')
            test_df['svd_dist_eucl'] = test_df.apply(lambda r: compute_svd_distance_eucl(r, f, ksvd), axis=1)
        else:
            test_features_df = pd.DataFrame(U, columns=features)
            test_df = pd.concat([test_df, test_features_df], axis=1)

        logging.info('Writing test features dump')
        test_file = join_path(dump_dir, 'test.csv')
        test_df[[FieldsTest.test_id] + features].to_csv(test_file, index=False)
def main(conf):
    dump_dir = conf['svdres.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svdres.dataset'])

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svdres.vectorizer'])
        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)
    if X is None:
        logging.info('Unable to load cached train feature matrix')
        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')
        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svdres.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svdres.svd'])
        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)

    logging.info('Train matrix %s', X.shape)

    logging.info('Computing train SVD residuals')
    # The stacked matrix holds q1 rows on top of q2 rows; residuals are the parts
    # of each row not captured by the top-k singular subspace, processed in batches.
    L = X.shape[0] / 2
    Xq1 = X[:L, :]
    Xq2 = X[L:, :]

    start = 0
    batch = 100
    eucl = np.zeros(Xq1.shape[0])
    cos = np.zeros(Xq1.shape[0])
    q1res = np.zeros(Xq1.shape[0])
    q2res = np.zeros(Xq1.shape[0])
    while start < Xq1.shape[0]:
        finish = min(start + batch, Xq1.shape[0])

        Xq1_batch = Xq1[start:finish, :]
        nq1 = (Xq1_batch.multiply(Xq1_batch)).sum(axis=1).flatten()
        Rq1 = safe_sparse_dot(Xq1_batch, VT.transpose()).dot(VT) - Xq1_batch
        nrq1 = np.sum(np.multiply(Rq1, Rq1), axis=1).flatten()

        Xq2_batch = Xq2[start:finish, :]
        nq2 = (Xq2_batch.multiply(Xq2_batch)).sum(axis=1).flatten()
        Rq2 = safe_sparse_dot(Xq2_batch, VT.transpose()).dot(VT) - Xq2_batch
        nrq2 = np.sum(np.multiply(Rq2, Rq2), axis=1).flatten()

        q1res[start:finish] = np.sqrt(nrq1) / np.sqrt(nq1)
        q2res[start:finish] = np.sqrt(nrq2) / np.sqrt(nq2)
        eucl[start:finish] = euclidean(Rq1, Rq2).flatten()
        cos[start:finish] = cosine(Rq1, Rq2).flatten()

        start = finish

    train_df['svd_res_q1'] = q1res
    train_df['svd_res_q2'] = q2res
    train_df['svd_res_eucl'] = eucl
    train_df['svd_res_cos'] = cos

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        'svd_res_q1',
        'svd_res_q2',
        'svd_res_eucl',
        'svd_res_cos'
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svddist.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD residuals')
    L = X.shape[0] / 2
    Xq1 = X[:L, :]
    Xq2 = X[L:, :]

    start = 0
    batch = 100
    eucl = np.zeros(Xq1.shape[0])
    cos = np.zeros(Xq1.shape[0])
    q1res = np.zeros(Xq1.shape[0])
    q2res = np.zeros(Xq1.shape[0])
    while start < Xq1.shape[0]:
        finish = min(start + batch, Xq1.shape[0])

        Xq1_batch = Xq1[start:finish, :]
        nq1 = (Xq1_batch.multiply(Xq1_batch)).sum(axis=1).flatten()
        Rq1 = safe_sparse_dot(Xq1_batch, VT.transpose()).dot(VT) - Xq1_batch
        nrq1 = np.sum(np.multiply(Rq1, Rq1), axis=1).flatten()

        Xq2_batch = Xq2[start:finish, :]
        nq2 = (Xq2_batch.multiply(Xq2_batch)).sum(axis=1).flatten()
        Rq2 = safe_sparse_dot(Xq2_batch, VT.transpose()).dot(VT) - Xq2_batch
        nrq2 = np.sum(np.multiply(Rq2, Rq2), axis=1).flatten()

        q1res[start:finish] = np.sqrt(nrq1) / np.sqrt(nq1)
        q2res[start:finish] = np.sqrt(nrq2) / np.sqrt(nq2)
        eucl[start:finish] = euclidean(Rq1, Rq2).flatten()
        cos[start:finish] = cosine(Rq1, Rq2).flatten()

        start = finish

    test_df['svd_res_q1'] = q1res
    test_df['svd_res_q2'] = q2res
    test_df['svd_res_eucl'] = eucl
    test_df['svd_res_cos'] = cos

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svd_res_q1',
        'svd_res_q2',
        'svd_res_eucl',
        'svd_res_cos'
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['xgboost.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['xgboost.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['xgboost.dataset'])

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].iteritems():
        logging.info('Loading features group: %s', group)
        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')
        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)
        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    feature_map_file = join_path(dump_dir, 'xgb.fmap')
    create_feature_map(features, feature_map_file)

    train_df_flipped = train_df.copy()
    for flip in conf['flip']:
        train_df_flipped[flip[0]] = train_df[[flip[1]]]
        train_df_flipped[flip[1]] = train_df[[flip[0]]]
    train_df = pd.concat([train_df, train_df_flipped], axis=0, ignore_index=True)
    logging.info('Train dataset: %s', train_df.shape)

    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()
    logging.info('Train dataset CTR: %s', y.sum() / len(y))

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}
    w = np.vectorize(class_weight.get)(y)
    logging.info('Train dataset weighted CTR: %s', sum(y * w) / sum(w))

    q1 = train_df[Fields.question1].values
    q2 = train_df[Fields.question2].values

    train_df.drop([
        FieldsTrain.id,
        FieldsTrain.qid1,
        FieldsTrain.qid2,
        FieldsTrain.question1,
        FieldsTrain.question2,
        FieldsTrain.is_duplicate
    ], axis=1, inplace=True)
    X = train_df.values

    logging.info('Training XGBoost model')
    model, progress, quality = train_xgboost(X, y, w, **conf['xgboost.param'])

    logging.info('Writing model dump')
    model_dump_file = join_path(dump_dir, 'model_dump.txt')
    model.dump_model(model_dump_file, fmap=feature_map_file, with_stats=True)
    model_file = join_path(dump_dir, 'model.bin')
    model.save_model(model_file)

    logging.info('Writing quality')
    # plot_quality(quality, dump_dir)

    logging.info('Writing top errors')
    errors_file = join_path(dump_dir, 'errors.csv')
    with open(errors_file, 'w') as fh:
        fh.write('y,p,question1,question2,sample\n')
        for e in quality['errors']['train']['type_i']:
            fh.write('%d,%s,%s,%s,%s\n' % (0, e[0], q1[e[1]], q2[e[1]], 'train'))
        for e in quality['errors']['train']['type_ii']:
            fh.write('%d,%s,%s,%s,%s\n' % (1, e[0], q1[e[1]], q2[e[1]], 'train'))
        for e in quality['errors']['valid']['type_i']:
            fh.write('%d,%s,%s,%s,%s\n' % (0, e[0], q1[e[1]], q2[e[1]], 'valid'))
        for e in quality['errors']['valid']['type_ii']:
            fh.write('%d,%s,%s,%s,%s\n' % (1, e[0], q1[e[1]], q2[e[1]], 'valid'))

    logging.info('Writing progress file')
    # plot_progress(progress, dump_dir)
    progress_file = join_path(dump_dir, 'progress.json')
    with open(progress_file, 'w') as fh:
        json.dump(progress, fh)

    logging.info('Writing feature scores')
    score_weight = model.get_score(fmap=feature_map_file, importance_type='weight')
    score_gain = model.get_score(fmap=feature_map_file, importance_type='gain')
    score_cover = model.get_score(fmap=feature_map_file, importance_type='cover')
    split_histograms = dict()
    for f in features:
        split_histograms[f] = model.get_split_value_histogram(f, fmap=feature_map_file)

    scores = pd.DataFrame([score_weight, score_gain, score_cover]).transpose()
    scores.index.name = 'feature'
    scores.rename(columns={0: 'weight', 1: 'gain', 2: 'cover'}, inplace=True)
    weight_total = scores['weight'].sum()
    scores['weight'] = scores['weight'] / weight_total
    scores.sort_values(by='weight', ascending=False, inplace=True)
    scores.to_csv(join_path(dump_dir, 'feature_scores.csv'))

    logging.info('Computing test predictions')
    test_ids = test_df[[FieldsTest.test_id]]
    test_df.drop([FieldsTest.test_id, FieldsTest.question1, FieldsTest.question2], axis=1, inplace=True)
    dtest = xgb.DMatrix(test_df.values)
    p_test = model.predict(dtest)

    logging.info('Writing submission file')
    submission_file = join_path(dump_dir, 'submission.csv')
    submission(submission_file, test_ids, p_test)
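# create_feature_map and submission are defined elsewhere in the repository.
# Minimal sketches of what they plausibly do: write an XGBoost fmap file
# ("index<TAB>name<TAB>q" per feature, the format expected by get_score and
# dump_model) and a two-column Kaggle submission; the exact implementations are
# assumptions.
def create_feature_map_sketch(features, path):
    with open(path, 'w') as fh:
        for i, feature in enumerate(features):
            fh.write('%d\t%s\tq\n' % (i, feature))

def submission_sketch(path, test_ids, predictions):
    out = test_ids.copy()
    out['is_duplicate'] = predictions
    out.to_csv(path, index=False)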
def main(conf):
    dump_dir = conf['svddist.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svddist.dataset'])

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svddist.vectorizer'])
        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)
    if X is None:
        logging.info('Unable to load cached train feature matrix')
        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')
        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svddist.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svddist.svd'])
        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)

    logging.info('Computing train SVD features')
    Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
    U = X.dot(VT.transpose().dot(Sinv))
    logging.info('Train feature matrix dimensions: %s', U.shape)

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    del U

    logging.info('Computing euclidean')
    train_df['svd_eucl'] = euclidean(Uq1, Uq2)
    logging.info('Computing cosine')
    train_df['svd_cosine'] = cosine(Uq1, Uq2)
    del Uq1, Uq2

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        'svd_eucl',
        'svd_cosine'
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svddist.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD features')
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    del U

    logging.info('Computing test euclidean')
    test_df['svd_eucl'] = euclidean(Uq1, Uq2)
    logging.info('Computing test cosine')
    test_df['svd_cosine'] = cosine(Uq1, Uq2)
    del Uq1, Uq2

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svd_eucl',
        'svd_cosine'
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
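# The euclidean/cosine helpers used above take the two halves of the stacked SVD
# feature matrix and return one distance per question pair, i.e. they operate
# row-wise. A minimal numpy sketch of helpers with that behaviour; the
# repository's own implementations may differ.
def rowwise_euclidean_sketch(A, B):
    import numpy as np
    A, B = np.asarray(A), np.asarray(B)
    return np.sqrt(np.sum(np.square(A - B), axis=1))

def rowwise_cosine_sketch(A, B):
    import numpy as np
    A, B = np.asarray(A), np.asarray(B)
    num = np.sum(A * B, axis=1)
    den = np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1)
    # guard against zero rows to avoid division by zero
    return 1.0 - num / np.maximum(den, 1e-12)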
def main(conf):
    dump_dir = conf['xgboost.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['xgboost.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['xgboost.dataset'])

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].iteritems():
        logging.info('Loading features group: %s', group)
        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')
        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)
        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    feature_map_file = join_path(dump_dir, 'xgb.fmap')
    create_feature_map(features, feature_map_file)

    train_df_flipped = train_df.copy()
    for flip in conf['flip']:
        train_df_flipped[flip[0]] = train_df[[flip[1]]]
        train_df_flipped[flip[1]] = train_df[[flip[0]]]
    train_df = pd.concat([train_df, train_df_flipped], axis=0, ignore_index=True)
    logging.info('Train dataset: %s', train_df.shape)

    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()
    logging.info('Train dataset CTR: %s', y.sum() / len(y))

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}
    w = np.vectorize(class_weight.get)(y)
    logging.info('Train dataset weighted CTR: %s', sum(y * w) / sum(w))

    q1 = train_df[Fields.question1].values
    q2 = train_df[Fields.question2].values

    train_df.drop([
        FieldsTrain.id,
        FieldsTrain.qid1,
        FieldsTrain.qid2,
        FieldsTrain.question1,
        FieldsTrain.question2,
        FieldsTrain.is_duplicate
    ], axis=1, inplace=True)

    logging.info('Computing test predictions')
    test_ids = test_df[[FieldsTest.test_id]]
    test_df.drop([FieldsTest.test_id, FieldsTest.question1, FieldsTest.question2], axis=1, inplace=True)
    dtest = xgb.DMatrix(test_df.values)

    model = xgb.Booster({'nthread': 4})
    model.load_model(join_path(dump_dir, 'model.bin'))
    p_test = model.predict(dtest)

    logging.info('Writing submission file')
    submission_file = join_path(dump_dir, 'submission.csv')
    submission(submission_file, test_ids, p_test)