def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['counters.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['counters.dataset'])

    logging.info('Computing question frequencies')
    compute_counters(train_df, test_df)

    logging.info('Writing dump')
    dump_dir = conf['counters.dump.dir']

    try:
        makedirs(dump_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, FieldsTrain.freq_q1,
        FieldsTrain.freq_q2, FieldsTrain.intersect_q1_q2,
        FieldsTrain.intersect2_q1_q2
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[
        FieldsTest.test_id, FieldsTest.freq_q1, FieldsTest.freq_q2,
        FieldsTest.intersect_q1_q2, FieldsTest.intersect2_q1_q2
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
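# Hedged sketch: the examples below call a bare makedirs(dump_dir) without the errno
# guard used above, which suggests the project wraps os.makedirs in a helper that
# tolerates an already-existing directory. A minimal version of such a helper (an
# assumption, not necessarily the repository's own implementation):
import errno
import os


def makedirs(path):
    """Create path recursively, ignoring the error when it already exists."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise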
def main(conf):
    dump_dir = conf['simplest.dump.dir']

    try:
        makedirs(dump_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    logging.info('Loading train dataset')
    train_df = load_train_df()

    logging.info('Loading test dataset')
    test_df = load_test_df()

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, Fields.len_q1, Fields.len_q2,
        Fields.diff_len, Fields.len_word_q1, Fields.len_word_q2,
        Fields.diff_len_word, Fields.len_char_q1, Fields.len_char_q2,
        Fields.diff_len_char
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id, Fields.len_q1, Fields.len_q2, Fields.diff_len,
        Fields.len_word_q1, Fields.len_word_q2, Fields.diff_len_word,
        Fields.len_char_q1, Fields.len_char_q2, Fields.diff_len_char
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
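# Hedged sketch of what compute_features() might do for this "simplest" group,
# inferred only from the dumped column names (len_*, len_word_*, len_char_*, diff_*);
# the exact definitions are assumptions, not the repository's implementation.
def compute_length_features(df):
    q1 = df['question1'].fillna('')
    q2 = df['question2'].fillna('')
    df['len_q1'] = q1.str.len()
    df['len_q2'] = q2.str.len()
    df['diff_len'] = (df['len_q1'] - df['len_q2']).abs()
    df['len_word_q1'] = q1.str.split().str.len()
    df['len_word_q2'] = q2.str.split().str.len()
    df['diff_len_word'] = (df['len_word_q1'] - df['len_word_q2']).abs()
    df['len_char_q1'] = q1.str.replace(' ', '').str.len()
    df['len_char_q2'] = q2.str.replace(' ', '').str.len()
    df['diff_len_char'] = (df['len_char_q1'] - df['len_char_q2']).abs()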
def main(conf):
    logging.info('Loading training dataset')
    train_df = load_train_df(conf['kcores.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['kcores.dataset'])

    logging.info('Loading kcores dump')
    kcores = load_kcores(conf['kcores.source'])

    def substitute_kcores(q):
        return kcores.get(q.lower(), 0)

    train_df['q1_kcores'] = train_df.apply(
        lambda r: substitute_kcores(r['question1']), axis=1)
    train_df['q2_kcores'] = train_df.apply(
        lambda r: substitute_kcores(r['question2']), axis=1)

    test_df['q1_kcores'] = test_df.apply(
        lambda r: substitute_kcores(r['question1']), axis=1)
    test_df['q2_kcores'] = test_df.apply(
        lambda r: substitute_kcores(r['question2']), axis=1)

    logging.info('Writing dump')
    dump_dir = conf['kcores.dump.dir']
    makedirs(dump_dir)

    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, FieldsTrain.q1_kcores,
        FieldsTrain.q2_kcores
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[FieldsTest.test_id, FieldsTest.q1_kcores,
             FieldsTest.q2_kcores]].to_csv(join_path(dump_dir, 'test.csv'),
                                           index=False)
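# Note: the row-wise apply(..., axis=1) calls above only read a single column each,
# so an equivalent and typically much faster formulation is a plain Series.map, e.g.
#     train_df['q1_kcores'] = train_df['question1'].map(substitute_kcores)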
def main(conf):
    dump_dir = conf['distances.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['distances.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['distances.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.jaccard,
        Fields.levenstein1,
        Fields.levenstein2,
        Fields.sorensen
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.jaccard,
        Fields.levenstein1,
        Fields.levenstein2,
        Fields.sorensen
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
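# Hedged sketch of distance features matching the column names above (jaccard,
# levenstein1/levenstein2, sorensen), assuming the PyPI `distance` package and
# whitespace tokenization; the repository's compute_features() may differ.
import distance


def distance_features(q1, q2):
    t1, t2 = str(q1).lower().split(), str(q2).lower().split()
    return {
        'jaccard': distance.jaccard(t1, t2),
        'levenstein1': distance.nlevenshtein(t1, t2, method=1),
        'levenstein2': distance.nlevenshtein(t1, t2, method=2),
        'sorensen': distance.sorensen(t1, t2),
    }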
def main(conf):
    dump_dir = conf['cleaning']['dump']['dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df()

    logging.info('Cleaning train dataset')
    train_df[Fields.question1] = train_df[Fields.question1].apply(
        lambda q: clean(q, **conf['cleaning']))
    train_df[Fields.question2] = train_df[Fields.question2].apply(
        lambda q: clean(q, **conf['cleaning']))

    logging.info('Writing train dataset')
    train_df.to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df()

    logging.info('Cleaning test dataset')
    test_df[Fields.question1] = test_df[Fields.question1].apply(
        lambda q: clean(q, **conf['cleaning']))
    test_df[Fields.question2] = test_df[Fields.question2].apply(
        lambda q: clean(q, **conf['cleaning']))

    logging.info('Writing test dataset')
    test_df.to_csv(join_path(dump_dir, 'test.csv'), index=False)
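# Hedged sketch of a clean() helper compatible with the clean(q, **conf['cleaning'])
# calls above. The flag names are illustrative assumptions; the **kwargs catch-all
# absorbs unrelated keys (such as the 'dump' sub-section) living in the same config
# block.
import re
import string

_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


def clean(q, lowercase=True, strip_punctuation=True, **kwargs):
    q = '' if q is None else str(q)
    if lowercase:
        q = q.lower()
    if strip_punctuation:
        q = _PUNCT_RE.sub(' ', q)
    return re.sub(r'\s+', ' ', q).strip()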
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['baseline.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['baseline.dataset'])

    logging.info('Computing baseline features')
    compute_features(train_df, test_df)

    logging.info('Writing dump')
    dump_dir = conf['baseline.dump.dir']
    makedirs(dump_dir)

    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, FieldsTrain.word_match,
        FieldsTrain.jaccard, FieldsTrain.wc_diff, FieldsTrain.wc_ratio,
        FieldsTrain.wc_diff_unique, FieldsTrain.wc_ratio_unique,
        FieldsTrain.wc_diff_unq_stop, FieldsTrain.wc_ratio_unique_stop,
        FieldsTrain.same_start, FieldsTrain.char_diff,
        FieldsTrain.char_diff_unq_stop, FieldsTrain.total_unique_words,
        FieldsTrain.total_unq_words_stop, FieldsTrain.char_ratio,
        FieldsTrain.tfidf_wm, FieldsTrain.tfidf_wm_stops
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    test_df[[
        FieldsTest.test_id, FieldsTest.word_match, FieldsTest.jaccard,
        FieldsTest.wc_diff, FieldsTest.wc_ratio, FieldsTest.wc_diff_unique,
        FieldsTest.wc_ratio_unique, FieldsTest.wc_diff_unq_stop,
        FieldsTest.wc_ratio_unique_stop, FieldsTest.same_start,
        FieldsTest.char_diff, FieldsTest.char_diff_unq_stop,
        FieldsTest.total_unique_words, FieldsTest.total_unq_words_stop,
        FieldsTest.char_ratio, FieldsTest.tfidf_wm, FieldsTest.tfidf_wm_stops
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
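# Hedged sketch of the word_match baseline feature referenced above (share of
# non-stopword tokens common to both questions), in the spirit of the public
# Quora-pairs baseline kernels; the repository's compute_features() may differ.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def word_match_share(q1, q2):
    w1 = {w for w in str(q1).lower().split() if w not in ENGLISH_STOP_WORDS}
    w2 = {w for w in str(q2).lower().split() if w not in ENGLISH_STOP_WORDS}
    if not w1 or not w2:
        return 0.0
    shared = w1 & w2
    return 2.0 * len(shared) / (len(w1) + len(w2))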
def main(conf):
    dump_dir = conf['exploration.dump.dir']
    makedirs(dump_dir)

    notebook_file = join_path(dump_dir, conf['exploration.dump.notebook'])
    notebook_cells = []

    images_dir = join_path(dump_dir, conf['exploration.dump.images.dir'])
    makedirs(images_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df()
    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()

    logging.info('Loading test dataset')
    test_df = load_test_df()

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].items():
        logging.info('Loading features group: %s', group)

        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')

        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)

        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    figure = plt.figure(1, figsize=[8, 6])

    for feature in features:
        logging.info('Feature: %s', feature)
        train_stats = train_df[[feature]].describe()
        test_stats = test_df[[feature]].describe()

        cell = new_markdown_cell("# %s" % feature)
        notebook_cells.append(cell)

        sns.distplot(train_df[[feature]])
        sns.distplot(test_df[[feature]])

        image_file = join_path(images_dir, 'hist_%s.png' % feature)
        figure.savefig(image_file)

        plt.cla()

    nb = new_notebook(cells=notebook_cells)
    with open(notebook_file, 'w') as fh:
        nb_write(nb, fh)
def main(conf):
    dump_dir = conf['mephistopheies.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['mephistopheies.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['mephistopheies.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        FieldsTrain.unigram_all_jaccard,
        FieldsTrain.unigram_all_jaccard_max,
        FieldsTrain.bigram_all_jaccard,
        FieldsTrain.bigram_all_jaccard_max,
        FieldsTrain.trigram_all_jaccard,
        FieldsTrain.trigram_all_jaccard_max,
        FieldsTrain.trigram_tfidf_cosine,
        FieldsTrain.trigram_tfidf_l2_euclidean,
        FieldsTrain.m_q1_q2_tf_svd0,
        FieldsTrain.m_q1_q2_tf_svd1,
        FieldsTrain.m_q1_q2_tf_svd2,
        #FieldsTrain.m_w1l_tfidf_oof
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        FieldsTest.unigram_all_jaccard,
        FieldsTest.unigram_all_jaccard_max,
        FieldsTest.bigram_all_jaccard,
        FieldsTest.bigram_all_jaccard_max,
        FieldsTest.trigram_all_jaccard,
        FieldsTest.trigram_all_jaccard_max,
        FieldsTest.trigram_tfidf_cosine,
        FieldsTest.trigram_tfidf_l2_euclidean,
        FieldsTest.m_q1_q2_tf_svd0,
        FieldsTest.m_q1_q2_tf_svd1,
        FieldsTest.m_q1_q2_tf_svd2,
        #FieldsTest.m_w1l_tfidf_oof
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['tfidf.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['tfidf.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['tfidf.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, Fields.m_q1_q2_tf_svd0
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[FieldsTest.test_id,
             Fields.m_q1_q2_tf_svd0]].to_csv(join_path(dump_dir, 'test.csv'),
                                             index=False)
def main(conf):
    dump_dir = conf['fuzzy.dump.dir']
    makedirs(dump_dir)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['fuzzy.dataset'])

    compute_features(train_df, test_df)

    logging.info('Writing train dataset to disk')
    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Writing test dataset to disk')
    test_df[[
        FieldsTest.test_id,
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
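# Hedged sketch of the fuzzy features above, assuming the fuzzywuzzy package (all of
# these ratio functions exist in fuzzywuzzy.fuzz); the repository's compute_features()
# presumably wires the same scores into the dataframe columns.
from fuzzywuzzy import fuzz


def fuzzy_features(q1, q2):
    q1, q2 = str(q1), str(q2)
    return {
        'qratio': fuzz.QRatio(q1, q2),
        'wratio': fuzz.WRatio(q1, q2),
        'partial_ratio': fuzz.partial_ratio(q1, q2),
        'partial_token_set_ratio': fuzz.partial_token_set_ratio(q1, q2),
        'partial_token_sort_ratio': fuzz.partial_token_sort_ratio(q1, q2),
        'token_set_ratio': fuzz.token_set_ratio(q1, q2),
        'token_sort_ratio': fuzz.token_sort_ratio(q1, q2),
    }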
def main(conf):
    dump_dir = conf['svdff.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svdff.dataset'])

    y = train_df['is_duplicate'].values

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except Exception:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svdff.vectorizer'])

        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)

    if X is None:
        logging.info('Unable to load cached train feature matrix')

        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')

        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svdff.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except Exception:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svdff.svd'])

        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)
        plot_singular_values(S, dump_dir)

    logging.info('Computing train SVD features')
    Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Train feature matrix dimensions: %s', U.shape)

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0])
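    # (Uq1 + Uq2) / 2 is invariant to swapping question1/question2 while
    # (Uq1 - Uq2) / 2 only flips sign, so the concatenated block gives the network
    # features that are symmetric/antisymmetric in question order.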

    logging.info('Training feature matrix: %s', U.shape)

    logging.info('Training feed-forward neural networks')
    quality, predictions = train_ff(U,
                                    y,
                                    skfold(),
                                    dump_dir=dump_dir,
                                    **conf['svdff.ff'])

    logging.info('Plotting quality metrics')
    quality_dir = join_path(dump_dir, 'quality')
    makedirs(quality_dir)
    for q in quality['folds']:
        img_dir = join_path(quality_dir, 'fold%d' % q['fold'])
        makedirs(img_dir)
        plot_quality(q, img_dir)

    logging.info('Writing train features')
    train_df['svdff'] = predictions

    train_df[[FieldsTrain.id, FieldsTrain.is_duplicate,
              'svdff']].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svdff.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD features')
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0])

    logging.info('Applying models to test dataset')
    test_df['svdff'] = np.zeros(U.shape[0])
    for q in quality['folds']:
        f = load_model(q['dump'])
        p = f.predict_proba(U).flatten()
        test_df['svdff'] = test_df['svdff'] + logit(p)
    test_df['svdff'] = test_df['svdff'] / len(quality['folds'])

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svdff',
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['glove.dump.dir']
    makedirs(dump_dir)

    logging.warning('Loading train dataset')
    train_df = load_train_df(conf['glove.dataset'])

    logging.warning('Loading test dataset')
    test_df = load_test_df(conf['glove.dataset'])

    logging.warning('Loading embeddings')
    embeddings_dir = conf['glove.embeddings.dir']
    embeddings_file = join_path(embeddings_dir, conf['glove.embeddings.file'])
    glove = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file,
                                                            binary=False)
    glove.init_sims(replace=True)
    processor = Glove(glove)

    logging.warning('Computing train features')

    train_df[Fields.glove_wmd], \
    train_df[Fields.glove_cos], \
    train_df[Fields.glove_city], \
    train_df[Fields.glove_jacc], \
    train_df[Fields.glove_canb], \
    train_df[Fields.glove_eucl], \
    train_df[Fields.glove_mink], \
    train_df[Fields.glove_bray], \
    train_df[Fields.glove_skew_q1], \
    train_df[Fields.glove_skew_q2], \
    train_df[Fields.glove_kurt_q1], \
    train_df[Fields.glove_kurt_q2] = \
        zip(*train_df.progress_apply(lambda r: processor.features(r['question1'], r['question2']), axis=1))

    for feature in [f for f in dir(Fields()) if f.startswith('glove')]:
        logging.warning(
            'Feature %s AUC=%s', feature,
            roc_auc_score(train_df[FieldsTrain.is_duplicate],
                          train_df[feature]))

    logging.warning('Writing train feature dump')
    train_df.drop([
        Fields.question1, Fields.question2, FieldsTrain.qid1, FieldsTrain.qid2
    ],
                  axis=1,
                  inplace=True)
    train_df.to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.warning('Computing test features')
    test_df[Fields.glove_wmd], \
    test_df[Fields.glove_cos], \
    test_df[Fields.glove_city], \
    test_df[Fields.glove_jacc], \
    test_df[Fields.glove_canb], \
    test_df[Fields.glove_eucl], \
    test_df[Fields.glove_mink], \
    test_df[Fields.glove_bray], \
    test_df[Fields.glove_skew_q1], \
    test_df[Fields.glove_skew_q2], \
    test_df[Fields.glove_kurt_q1], \
    test_df[Fields.glove_kurt_q2] = \
        zip(*test_df.progress_apply(lambda r: processor.features(r['question1'], r['question2']), axis=1))

    logging.warning('Writing test feature dump')
    test_df.drop([Fields.question1, Fields.question2], axis=1, inplace=True)
    test_df.to_csv(join_path(dump_dir, 'test.csv'), index=False)
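# Hedged sketch of what Glove.features(q1, q2) above might compute, matching the
# glove_* column names: word-mover's distance from gensim plus vector distances and
# per-question skew/kurtosis of averaged word vectors via scipy. The helper names
# below are illustrative assumptions, not the repository's actual implementation.
import numpy as np
from scipy.spatial import distance as spd
from scipy.stats import kurtosis, skew


def sentence_vector(glove, text):
    vectors = [glove[w] for w in str(text).lower().split() if w in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(glove.vector_size)


def glove_features(glove, q1, q2):
    tokens1, tokens2 = str(q1).lower().split(), str(q2).lower().split()
    v1, v2 = sentence_vector(glove, q1), sentence_vector(glove, q2)
    return (glove.wmdistance(tokens1, tokens2),
            spd.cosine(v1, v2), spd.cityblock(v1, v2), spd.jaccard(v1, v2),
            spd.canberra(v1, v2), spd.euclidean(v1, v2), spd.minkowski(v1, v2, 3),
            spd.braycurtis(v1, v2),
            skew(v1), skew(v2), kurtosis(v1), kurtosis(v2))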
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['dataset_raw'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['dataset_raw'])

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}

    for w, cnf in conf['linear'].items():
        if not cnf.get_bool('enabled', True):
            continue

        if w == 'dataset':
            continue

        logging.info('Start training linear model: %s', w)

        dump_dir = cnf.get('dump.dir') or '.'
        makedirs(dump_dir)

        config_file = join_path(dump_dir, 'application.conf')
        dump_config(conf, config_file)

        vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
        quality_file = join_path(dump_dir, 'quality.json')

        y = train_df[FieldsTrain.is_duplicate]

        if cnf['dump.cache.enabled']:
            logging.info('Loading vectorizer')

            try:
                vectorizer = joblib.load(vectorizer_file)
            except Exception:
                logging.info('Unable to load vectorizer')
                vectorizer = None

            if vectorizer is None:
                logging.info('Training vectorizer')

                vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])
                nf = len(vectorizer.vocabulary_)
                logging.info('Feature count: %d', nf)

                logging.info('Dumping vectorizer')
                joblib.dump(vectorizer, vectorizer_file)

            features_cache_file = join_path(dump_dir, cnf['dump.cache.train'])
            logging.info('Loading cached train feature matrix from %s',
                         features_cache_file)
            X = load_feature_matrix(features_cache_file)

            if X is None:
                logging.info('Unable to load cached train feature matrix')

                logging.info('Computing train feature matrix')
                X = compute_feature_matrix(train_df,
                                           vectorizer,
                                           combine=cnf['combine'])

                logging.info('Writing train feature matrix to %s',
                             features_cache_file)
                save_feature_matrix(X, features_cache_file)
        else:
            logging.info('Training vectorizer')
            vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])
            X = compute_feature_matrix(train_df,
                                       vectorizer,
                                       combine=cnf['combine'])
            nf = len(vectorizer.vocabulary_)
            logging.info('Feature count: %d', nf)

        logging.info('Training feature matrix: %s', X.shape)

        quality, predictions = train(X,
                                     y,
                                     skfold(),
                                     class_weight,
                                     dump_dir=dump_dir,
                                     **cnf['model'])

        with open(quality_file, 'w') as qfh:
            json.dump(quality, qfh)

        logging.info('Writing train set to disk')
        train_df[FieldsTrain.linear] = predictions
        train_df[[
            FieldsTrain.id, FieldsTrain.is_duplicate, FieldsTrain.linear
        ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

        if cnf['dump.cache.enabled']:
            features_cache_file = join_path(dump_dir, cnf['dump.cache.test'])

            logging.info('Loading cached test feature matrix from %s',
                         features_cache_file)
            X = load_feature_matrix(features_cache_file)
            if X is None:
                logging.info('Unable to load cached test feature matrix')
                logging.info('Computing test feature matrix')
                X = compute_feature_matrix(test_df,
                                           vectorizer,
                                           combine=cnf['combine'])

                logging.info('Writing test feature matrix to cache')
                save_feature_matrix(X, features_cache_file)
        else:
            logging.info('Computing test feature matrix')
            X = compute_feature_matrix(test_df,
                                       vectorizer,
                                       combine=cnf['combine'])

        logging.info(
            'Computing test predictions as average logit of cross-validation models'
        )
        test_df[FieldsTest.linear_cv] = np.zeros(X.shape[0])
        for fold in quality['folds']:
            f = joblib.load(fold['dump'])
            p = logit(f.predict_proba(X)[:, 1])
            test_df[FieldsTest.linear_cv] = test_df[FieldsTest.linear_cv] + p
        test_df[FieldsTest.linear_cv] = test_df[FieldsTest.linear_cv] / len(
            quality['folds'])

        logging.info('Computing test predictions with full model')
        f = joblib.load(quality['full']['unweighted']['dump'])
        p = logit(f.predict_proba(X)[:, 1])
        test_df[FieldsTest.linear_full] = p

        logging.info('Computing test predictions with full weighted model')
        f = joblib.load(quality['full']['weighted']['dump'])
        p = logit(f.predict_proba(X)[:, 1])
        test_df[FieldsTest.linear_full_weighted] = p

        logging.info('Writing test set to disk')
        test_df[[
            FieldsTest.test_id, FieldsTest.linear_cv, FieldsTest.linear_full,
            FieldsTest.linear_full_weighted
        ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
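# Note: in the loop above, per-fold test predictions are averaged in logit space
# (log(p / (1 - p))) rather than probability space, and the averaged logits are what
# get dumped as stacking features; they are never mapped back through the sigmoid here.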
def main(conf):
    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svd.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svd.dataset'])

    for f, cnf in conf['svd'].items():
        if f == 'dataset':
            continue

        if not cnf.get('enabled', True):
            continue

        logging.info('Start training SVD model %s', f)

        dump_dir = cnf['dump.dir']
        makedirs(dump_dir)
        logging.info('Dump %s', dump_dir)

        vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
        try:
            logging.info('Loading vectorizer dump')
            vectorizer = joblib.load(vectorizer_file)
        except Exception:
            logging.info('Loading vectorizer dump failed')
            logging.info('Training vectorizer: %s', cnf['vectorizer'])
            vectorizer = train_vectorizer(train_df, **cnf['vectorizer'])

            logging.info('Writing vectorizer dump')
            joblib.dump(vectorizer, vectorizer_file)

        train_features_matrix_file = join_path(dump_dir, 'train_features.npz')
        logging.info('Loading train features matrix')
        X = load_feature_matrix(train_features_matrix_file)
        if X is None:
            logging.info('Loading train feature matrix failed')
            logging.info('Computing train feature matrix')
            X = compute_feature_matrix(train_df, vectorizer, combine=cnf.get('model.transform', None))

            logging.info('Writing train feature matrix dump')
            save_feature_matrix(X, train_features_matrix_file)

        logging.info('Computing SVD decomposition')
        ksvd = cnf['model'].get_int('k')
        S, VT = compute_svd(X.asfptype(), **cnf['model'])
        Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
        logging.info('Singular values %s', S)

        logging.info('Computing train SVD features')
        U = X.dot(VT.transpose()).dot(Sinv)
        logging.info('Train features variance: %s', np.var(U, axis=0))

        features = [f + '_%d' % i for i in range(U.shape[1])]
        if cnf.get('model.transform', None) == 'stack':
            features_q1 = [s + '_q1' for s in features]
            features_q2 = [s + '_q2' for s in features]
            features = features_q1 + features_q2
            train_features_df_q1 = pd.DataFrame(U[:train_df.shape[0], :], columns=features_q1)
            train_features_df_q2 = pd.DataFrame(U[train_df.shape[0]:, :], columns=features_q2)
            train_df = pd.concat([train_df, train_features_df_q1, train_features_df_q2], axis=1)

            train_df['svd_dist_eucl'] = train_df.apply(lambda r: compute_svd_distance_eucl(r, f, ksvd), axis=1)
            features.append('svd_dist_eucl')
        else:
            train_features_df = pd.DataFrame(U, columns=features)
            train_df = pd.concat([train_df, train_features_df], axis=1)

        for feature in features:
            logging.info('Feature %s AUC=%s', feature, roc_auc_score(train_df[FieldsTrain.is_duplicate], train_df[feature]))

        logging.info('Writing train features dump')
        train_file = join_path(dump_dir, 'train.csv')
        train_df[[FieldsTrain.id, FieldsTrain.is_duplicate] + features].to_csv(train_file, index=False)

        test_features_matrix_file = join_path(dump_dir, 'test_features.npz')
        logging.info('Loading test features matrix')
        X = load_feature_matrix(test_features_matrix_file)
        if X is None:
            logging.info('Loading test feature matrix failed')
            logging.info('Computing test feature matrix')
            X = compute_feature_matrix(test_df, vectorizer, combine=cnf.get('model.transform', None))

            logging.info('Writing test feature matrix dump')
            save_feature_matrix(X, test_features_matrix_file)

        U = X.dot(VT.transpose()).dot(Sinv)
        logging.info('Test features variance: %s', np.var(U, axis=0))

        logging.info('Computing test SVD features')
        if cnf.get('model.transform', None) == 'stack':
            logging.info('Computing q1 test SVD features')
            test_features_df_q1 = pd.DataFrame(U[:test_df.shape[0], :], columns=features_q1)
            test_df = pd.concat([test_df, test_features_df_q1], axis=1)
            del test_features_df_q1

            logging.info('Computing q2 test SVD features')
            test_features_df_q2 = pd.DataFrame(U[test_df.shape[0]:, :], columns=features_q2)
            test_df = pd.concat([test_df, test_features_df_q2], axis=1)
            del test_features_df_q2

            logging.info('Computing svd distances')
            test_df['svd_dist_eucl'] = test_df.apply(lambda r: compute_svd_distance_eucl(r, f, ksvd), axis=1)
        else:
            test_features_df = pd.DataFrame(U, columns=features)
            test_df = pd.concat([test_df, test_features_df], axis=1)

        logging.info('Writing test features dump')
        test_file = join_path(dump_dir, 'test.csv')
        test_df[[FieldsTest.test_id] + features].to_csv(test_file, index=False)
def main(conf):
    dump_dir = conf['svdres.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svdres.dataset'])

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except Exception:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svdres.vectorizer'])

        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)

    if X is None:
        logging.info('Unable to load cached train feature matrix')

        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')

        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svdres.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except Exception:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svdres.svd'])

        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)

    logging.info('Train matrix %s', X.shape)
    logging.info('Computing train SVD residuals')
    L = X.shape[0] // 2
    Xq1 = X[:L, :]
    Xq2 = X[L:, :]

    start = 0
    batch = 100
    eucl = np.zeros(Xq1.shape[0])
    cos = np.zeros(Xq1.shape[0])
    q1res = np.zeros(Xq1.shape[0])
    q2res = np.zeros(Xq1.shape[0])
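    # For each row x, R = x*V^T*V - x is the residual left after projecting onto the
    # rank-k SVD subspace; q1res/q2res store its norm relative to the norm of x,
    # while eucl/cos compare the two questions' residual vectors directly, batch by batch.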
    while start < Xq1.shape[0]:
        finish = min(start + batch, Xq1.shape[0])

        Xq1_batch = Xq1[start:finish, :]
        nq1 = (Xq1_batch.multiply(Xq1_batch)).sum(axis=1).flatten()

        Rq1 = safe_sparse_dot(Xq1_batch, VT.transpose()).dot(VT) - Xq1_batch
        nrq1 = np.sum(np.multiply(Rq1, Rq1), axis=1).flatten()

        Xq2_batch = Xq2[start:finish, :]
        nq2 = (Xq2_batch.multiply(Xq2_batch)).sum(axis=1).flatten()

        Rq2 = safe_sparse_dot(Xq2_batch, VT.transpose()).dot(VT) - Xq2_batch
        nrq2 = np.sum(np.multiply(Rq2, Rq2), axis=1).flatten()

        q1res[start:finish] = np.sqrt(nrq1) / np.sqrt(nq1)
        q2res[start:finish] = np.sqrt(nrq2) / np.sqrt(nq2)
        eucl[start:finish] = euclidean(Rq1, Rq2).flatten()
        cos[start:finish] = cosine(Rq1, Rq2).flatten()

        start = finish

    train_df['svd_res_q1'] = q1res
    train_df['svd_res_q2'] = q2res
    train_df['svd_res_eucl'] = eucl
    train_df['svd_res_cos'] = cos

    train_df[[
        FieldsTrain.id, FieldsTrain.is_duplicate, 'svd_res_q1', 'svd_res_q2',
        'svd_res_eucl', 'svd_res_cos'
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svdres.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD residuals')
    L = X.shape[0] // 2
    Xq1 = X[:L, :]
    Xq2 = X[L:, :]

    start = 0
    batch = 100
    eucl = np.zeros(Xq1.shape[0])
    cos = np.zeros(Xq1.shape[0])
    q1res = np.zeros(Xq1.shape[0])
    q2res = np.zeros(Xq1.shape[0])
    while start < Xq1.shape[0]:
        finish = min(start + batch, Xq1.shape[0])

        Xq1_batch = Xq1[start:finish, :]
        nq1 = (Xq1_batch.multiply(Xq1_batch)).sum(axis=1).flatten()

        Rq1 = safe_sparse_dot(Xq1_batch, VT.transpose()).dot(VT) - Xq1_batch
        nrq1 = np.sum(np.multiply(Rq1, Rq1), axis=1).flatten()

        Xq2_batch = Xq2[start:finish, :]
        nq2 = (Xq2_batch.multiply(Xq2_batch)).sum(axis=1).flatten()

        Rq2 = safe_sparse_dot(Xq2_batch, VT.transpose()).dot(VT) - Xq2_batch
        nrq2 = np.sum(np.multiply(Rq2, Rq2), axis=1).flatten()

        q1res[start:finish] = np.sqrt(nrq1) / np.sqrt(nq1)
        q2res[start:finish] = np.sqrt(nrq2) / np.sqrt(nq2)
        eucl[start:finish] = euclidean(Rq1, Rq2).flatten()
        cos[start:finish] = cosine(Rq1, Rq2).flatten()

        start = finish

    test_df['svd_res_q1'] = q1res
    test_df['svd_res_q2'] = q2res
    test_df['svd_res_eucl'] = eucl
    test_df['svd_res_cos'] = cos

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id, 'svd_res_q1', 'svd_res_q2', 'svd_res_eucl',
        'svd_res_cos'
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['xgboost.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['xgboost.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['xgboost.dataset'])

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].items():
        logging.info('Loading features group: %s', group)

        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')

        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)

        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    feature_map_file = join_path(dump_dir, 'xgb.fmap')
    create_feature_map(features, feature_map_file)

    train_df_flipped = train_df.copy()
    for flip in conf['flip']:
        train_df_flipped[flip[0]] = train_df[[flip[1]]]
        train_df_flipped[flip[1]] = train_df[[flip[0]]]

    train_df = pd.concat([train_df, train_df_flipped], axis=0, ignore_index=True)
    logging.info('Train dataset: %s', train_df.shape)

    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()
    logging.info('Train dataset CTR: %s', y.sum() / len(y))

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}
    w = np.vectorize(class_weight.get)(y)
    logging.info('Train dataset weighted CTR: %s', sum(y * w) / sum(w))

    q1 = train_df[Fields.question1].values
    q2 = train_df[Fields.question2].values

    train_df.drop([
        FieldsTrain.id,
        FieldsTrain.qid1,
        FieldsTrain.qid2,
        FieldsTrain.question1,
        FieldsTrain.question2,
        FieldsTrain.is_duplicate], axis=1, inplace=True)

    X = train_df.values

    logging.info('Training XGBoost model')
    model, progress, quality = train_xgboost(X, y, w, **conf['xgboost.param'])

    logging.info('Writing model dump')
    model_dump_file = join_path(dump_dir, 'model_dump.txt')
    model.dump_model(model_dump_file, fmap=feature_map_file, with_stats=True)
    model_file = join_path(dump_dir, 'model.bin')
    model.save_model(model_file)

    logging.info('Writing quality')
    # plot_quality(quality, dump_dir)

    logging.info('Writing top errors')
    errors_file = join_path(dump_dir, 'errors.csv')
    with open(errors_file, 'w') as fh:
        fh.write('y,p,question1,question2,sample\n')
        for e in quality['errors']['train']['type_i']:
            fh.write('%d,%s,%s,%s,%s\n' % (0, e[0], q1[e[1]], q2[e[1]], 'train'))
        for e in quality['errors']['train']['type_ii']:
            fh.write('%d,%s,%s,%s,%s\n' % (1, e[0], q1[e[1]], q2[e[1]], 'train'))
        for e in quality['errors']['valid']['type_i']:
            fh.write('%d,%s,%s,%s,%s\n' % (0, e[0], q1[e[1]], q2[e[1]], 'valid'))
        for e in quality['errors']['valid']['type_ii']:
            fh.write('%d,%s,%s,%s,%s\n' % (1, e[0], q1[e[1]], q2[e[1]], 'valid'))

    logging.info('Writing progress file')
    # plot_progress(progress, dump_dir)
    progress_file = join_path(dump_dir, 'progress.json')
    with open(progress_file, 'w') as fh:
        json.dump(progress, fh)

    logging.info('Writing feature scores')
    score_weight = model.get_score(fmap=feature_map_file, importance_type='weight')
    score_gain = model.get_score(fmap=feature_map_file, importance_type='gain')
    score_cover = model.get_score(fmap=feature_map_file, importance_type='cover')
    split_histograms = dict()
    for f in features:
        split_histograms[f] = model.get_split_value_histogram(f, fmap=feature_map_file)

    scores = pd.DataFrame([score_weight, score_gain, score_cover]).transpose()
    scores.index.name = 'feature'
    scores.rename(columns={0: 'weight', 1: 'gain', 2: 'cover'}, inplace=True)
    weight_total = scores['weight'].sum()
    scores['weight'] = scores['weight'] / weight_total
    scores.sort_values(by='weight', ascending=False, inplace=True)
    scores.to_csv(join_path(dump_dir, 'feature_scores.csv'))

    logging.info('Computing test predictions')
    test_ids = test_df[[FieldsTest.test_id]]
    test_df.drop([FieldsTest.test_id, FieldsTest.question1, FieldsTest.question2], axis=1, inplace=True)
    dtest = xgb.DMatrix(test_df.values)
    p_test = model.predict(dtest)

    logging.info('Writing submission file')
    submission_file = join_path(dump_dir, 'submission.csv')
    submission(submission_file, test_ids, p_test)
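# Hedged sketch of the create_feature_map() helper used above: xgboost feature-map
# files are plain text with one "index<TAB>name<TAB>type" line per feature, where
# 'q' marks a quantitative feature. The repository's helper may differ in details.
def create_feature_map(features, fmap_file):
    with open(fmap_file, 'w') as fh:
        for i, feature in enumerate(features):
            fh.write('%d\t%s\tq\n' % (i, feature))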
def main(conf):
    dump_dir = conf['svddist.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['svddist.dataset'])

    vectorizer_file = join_path(dump_dir, 'vectorizer.pkl')
    try:
        logging.info('Loading vectorizer dump')
        vectorizer = joblib.load(vectorizer_file)
    except Exception:
        logging.info('Loading vectorizer dump failed')
        logging.info('Training vectorizer')
        vectorizer = train_vectorizer(train_df, **conf['svddist.vectorizer'])

        logging.info('Writing vectorizer dump')
        joblib.dump(vectorizer, vectorizer_file)

    features_file = join_path(dump_dir, 'features_train.npz')
    logging.info('Loading cached train feature matrix from %s', features_file)
    X = load_feature_matrix(features_file)

    if X is None:
        logging.info('Unable to load cached train feature matrix')

        logging.info('Computing train feature matrix')
        X = compute_feature_matrix(train_df, vectorizer, combine='stack')

        logging.info('Writing train feature matrix to %s', features_file)
        save_feature_matrix(X, features_file)

    logging.info('Loading SVD decomposition')
    k = conf['svddist.svd'].get_int('k')
    singular_values_file = join_path(dump_dir, 'singular_values.txt')
    singular_vectors_file = join_path(dump_dir, 'singular_vectors.npz')
    try:
        S = np.loadtxt(singular_values_file)
        VT = np.load(singular_vectors_file)['VT']
        assert k == len(S)
    except Exception:
        logging.info('Loading SVD decomposition failed')
        logging.info('Computing SVD decomposition')
        S, VT = compute_svd(X.asfptype(), **conf['svddist.svd'])

        logging.info('Writing singular values to file')
        np.savetxt(singular_values_file, S)
        np.savez(singular_vectors_file, VT=VT)

    logging.info('Computing train SVD features')
    Sinv = np.diag(1. / S) * np.sqrt(X.shape[0])
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Train feature matrix dimensions: %s', U.shape)

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    del U

    logging.info('Computing euclidean')
    train_df['svd_eucl'] = euclidean(Uq1, Uq2)

    logging.info('Computing cosine')
    train_df['svd_cosine'] = cosine(Uq1, Uq2)
    del Uq1, Uq2

    train_df[[
        FieldsTrain.id,
        FieldsTrain.is_duplicate,
        'svd_eucl',
        'svd_cosine'
    ]].to_csv(join_path(dump_dir, 'train.csv'), index=False)

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svddist.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD features')
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    del U

    logging.info('Computing test euclidean')
    test_df['svd_eucl'] = euclidean(Uq1, Uq2)

    logging.info('Computing test cosine')
    test_df['svd_cosine'] = cosine(Uq1, Uq2)
    del Uq1, Uq2

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svd_eucl',
        'svd_cosine'
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)
def main(conf):
    dump_dir = conf['xgboost.dump.dir']
    makedirs(dump_dir)

    dump_config_file = join_path(dump_dir, 'application.conf')
    dump_config(conf, dump_config_file)

    logging.info('Loading train dataset')
    train_df = load_train_df(conf['xgboost.dataset'])

    logging.info('Loading test dataset')
    test_df = load_test_df(conf['xgboost.dataset'])

    logging.info('Loading features')
    features = []
    for group, cnf in conf['features'].items():
        logging.info('Loading features group: %s', group)

        features_dump_dir = cnf['dump']
        train_features_file = join_path(features_dump_dir, 'train.csv')
        test_features_file = join_path(features_dump_dir, 'test.csv')

        train_features = pd.read_csv(train_features_file)
        test_features = pd.read_csv(test_features_file)

        for fcnf in cnf['features']:
            feature = fcnf['feature']
            features.append(feature)
            train_col = fcnf.get('train_col', feature)
            test_col = fcnf.get('test_col', feature)
            train_df[feature] = train_features[train_col]
            test_df[feature] = test_features[test_col]

    feature_map_file = join_path(dump_dir, 'xgb.fmap')
    create_feature_map(features, feature_map_file)

    train_df_flipped = train_df.copy()
    for flip in conf['flip']:
        train_df_flipped[flip[0]] = train_df[[flip[1]]]
        train_df_flipped[flip[1]] = train_df[[flip[0]]]

    train_df = pd.concat([train_df, train_df_flipped],
                         axis=0,
                         ignore_index=True)
    logging.info('Train dataset: %s', train_df.shape)

    y = train_df[[FieldsTrain.is_duplicate]].values.flatten()
    logging.info('Train dataset CTR: %s', y.sum() / len(y))

    class_weight = {int(c['class']): c['weight'] for c in conf['weights']}
    w = np.vectorize(class_weight.get)(y)
    logging.info('Train dataset weighted CTR: %s', sum(y * w) / sum(w))

    q1 = train_df[Fields.question1].values
    q2 = train_df[Fields.question2].values

    train_df.drop([
        FieldsTrain.id, FieldsTrain.qid1, FieldsTrain.qid2,
        FieldsTrain.question1, FieldsTrain.question2, FieldsTrain.is_duplicate
    ],
                  axis=1,
                  inplace=True)

    logging.info('Computing test predictions')
    test_ids = test_df[[FieldsTest.test_id]]
    test_df.drop(
        [FieldsTest.test_id, FieldsTest.question1, FieldsTest.question2],
        axis=1,
        inplace=True)
    dtest = xgb.DMatrix(test_df.values)

    model = xgb.Booster({'nthread': 4})
    model.load_model(join_path(dump_dir, 'model.bin'))
    p_test = model.predict(dtest)

    logging.info('Writing submission file')
    submission_file = join_path(dump_dir, 'submission.csv')
    submission(submission_file, test_ids, p_test)