Example #1
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)
    print(auc_score(clf, X_new, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
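The function above fits each preprocessing step by hand and only re-assembles the fitted pieces into a Pipeline for the submission. As a hedged sketch (not the author's code), the same scale / RFE / scale / CV-tuned logistic regression chain can be declared once and fit in a single call; it assumes X and y are loaded as above and re-fits the RFE step instead of loading it from cache.

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ('scale_raw', StandardScaler()),
    ('rfe', RFE(estimator=LogisticRegression(), n_features_to_select=21)),
    ('scale_new', StandardScaler()),
    ('lr', LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)),
])
# pipe.fit(X, y) fits every step in order on the training data, and the fitted
# pipeline can then be handed to to_submission() directly.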
Example #2
def main():
    if os.path.isfile(cache_path(_ARGS.name)):
        if _ARGS.name == 'clean':
            # c, d = load_cache('run/' + _ARGS.name)
            # documents = df['tokens'].to_list()
            # dump_cache((c, d, documents), 'run/' + _ARGS.name)
            pipline(None)
            return
        else:
            df = load_cache(_ARGS.name)
    else:
        if _ARGS.name == 'clean':
            dfs = list()
            for i in range(6):
                path = f"./dev/data/clean{i}_covid19.xlsx"
                if os.path.isfile(cache_path(f'clean{i}')):
                    part = load_cache(f'clean{i}')
                else:
                    part = read(path)
                    dump_cache(part, f'clean{i}')
                dfs.append(part)
            df = concat(dfs, ignore_index=True)
        else:
            path = f"./dev/data/{_ARGS.name}_covid19.xlsx"
            df = read(path)

        dump_cache(df, _ARGS.name)
        # logging.disable(level=logging.INFO)
    pipline(df)
    return
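Every branch above repeats the same load-or-compute caching idiom: return the cached object if cache_path(name) exists, otherwise compute it and dump_cache it. A hedged helper sketch of that idiom, reusing the project's own cache helpers (cache_path, load_cache, dump_cache are assumed importable); the name cached is illustrative.

import os

def cached(name, compute):
    """Return the object cached under name, computing and caching it on a miss."""
    if os.path.isfile(cache_path(name)):
        return load_cache(name)
    obj = compute()
    dump_cache(obj, name)
    return obj

# e.g. part = cached(f'clean{i}', lambda: read(f"./dev/data/clean{i}_covid19.xlsx"))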
Example #3
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)
    params = {
        'loss': [
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ]
    }
    grid = GridSearchCV(sgd,
                        param_grid=params,
                        cv=StratifiedKFold(y, 5),
                        scoring='roc_auc',
                        n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))
    to_submission(
        Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                  ('scale_new', new_scaler), ('sgd', sgd)]), 'sgd_0620_03')
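This example targets the pre-0.20 scikit-learn API: sklearn.grid_search and sklearn.cross_validation were later removed, SGDClassifier's n_iter became max_iter, and StratifiedKFold no longer takes the labels in its constructor. A rough sketch of the same search against a current scikit-learn, assuming the same X_new and y, with the loss grid trimmed to names that are unchanged in recent releases:

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

sgd = SGDClassifier(max_iter=50, n_jobs=-1)
params = {'loss': ['hinge', 'modified_huber', 'squared_hinge', 'perceptron']}
grid = GridSearchCV(sgd, param_grid=params,
                    cv=StratifiedKFold(n_splits=5),  # labels go to fit(), not the splitter
                    scoring='roc_auc', n_jobs=-1)
# grid.fit(X_new, y); grid.best_estimator_ then plays the role of sgd above.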
Example #4
def lr():
    """
    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV
    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))
    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X, y)
    print(auc_score(clf, X, y))
    to_submission(clf, 'lr_0618_xxx')
Example #5
def load_test():
    """
    Load dataset for testing.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features.
    """
    pkl_path = util.cache_path('test_X')
    if os.path.exists(pkl_path):
        X = util.fetch(pkl_path)
    else:
        enroll_set = np.sort(util.load_enrollment_test()['enrollment_id'])
        # log = util.load_logs()
        # base_date = log['time'].max().to_datetime()
        base_date = datetime(2014, 8, 1, 22, 0, 47)
        X = None
        for f in MODELING['features']:
            X_ = f(enroll_set, base_date)
            if X is None:
                X = X_
            else:
                X = np.c_[X, X_]
        util.dump(X, pkl_path)
    return X
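Each feature function returns a block of columns for the same enrollments, and np.c_ joins those blocks column-wise into the final matrix. A tiny self-contained illustration with made-up data:

import numpy as np

a = np.array([[1.0], [2.0]])             # first feature block, shape (2, 1)
b = np.array([[3.0, 4.0], [5.0, 6.0]])   # second feature block, shape (2, 2)
X = np.c_[a, b]                          # joined column-wise, shape (2, 3)
print(X.shape)                           # (2, 3)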
Example #6
def pipline(data: DataFrame):
    if os.path.isfile(cache_path('run/' + _ARGS.name)):
        corpus, dictionary, documents = load_cache('run/' + _ARGS.name)
    elif data is not None:  # a DataFrame's truth value is ambiguous, so test against None
        documents = data['tokens'].to_list()
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(documents)

        # Filter out words that occur in fewer than 20 documents or in more than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)

        # Remove stop words
        bad_ids = [dictionary.token2id[t] for t in STOP_WORDS if t in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=bad_ids)

        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        dump_cache((corpus, dictionary, documents), 'run/' + _ARGS.name)
    else:
        raise ValueError('No cache found and no data was passed in')

    _ = dictionary[0]  # This is only to "load" the dictionary.
    output('Number of unique tokens: ', len(dictionary))
    output('Number of documents: ', len(corpus))
    # test = get_model(6, corpus, dictionary.id2token)

    topic_range = tuple(int(s.strip()) for s in _ARGS.range.split(','))
    kwargs = dict(
        id2word=dictionary.id2token, chunksize=len(corpus),
        passes=_ARGS.passes, alpha='auto', eta='auto', eval_every=1,
        iterations=_ARGS.iterations, random_state=123)
    if len(corpus) < 1e6:  # train the models in parallel
        pool = Pool(_ARGS.pool_size)
        result_dict = dict()
        for k in range(*topic_range):
            result_dict[k] = pool.apply_async(get_model, (corpus, k, kwargs))
        result_dict = {k: v.get() for k, v in result_dict.items()}
        pool.close()  # close the pool once the child processes have finished
        pool.join()
        output(f"Searched range {topic_range}")
        # the coherence computation spawns its own processes, so evaluate serially
        for k, (model, ids) in result_dict.items():
            eval_and_write(data, k, documents, dictionary, corpus, model, ids)
    else:
        # kwargs['alpha'] = 'symmetric'
        kwargs['chunksize'] = len(corpus) // 8 // _ARGS.pool_size + 1
        # kwargs['batch'] = True
        for k in range(*topic_range, 2):  # use a coarser search step for large corpora
            # model = LdaMulticore(corpus, k, workers=_ARGS.pool_size, **kwargs)
            model = LdaModel(corpus, k, **kwargs)
            ids = save_and_inference(model, corpus, k, kwargs['chunksize'])
            # result_dict[k] = (model, ids)  # not enough memory for ~4M sentences
            eval_and_write(None, k, documents, dictionary, corpus, model, ids)
            del model, ids
            gc.collect()

    output(f"===> {_ARGS.name} compete. \n")
Example #7
def dt():
    """
    Submission: dt_0620_05.csv
    E_val: 0.820972
    E_in: 0.835177
    E_out:
    Comment: {'max_depth': 5}
    """
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    dt = DecisionTreeClassifier(max_depth=5, class_weight='auto')
    dt.fit(X, y)

    export_graphviz(dt, 'tree.dot')

    logger.debug('E_in: %f', auc_score(dt, X, y))
    to_submission(dt, 'dt_0620_05')
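export_graphviz writes tree.dot in plain Graphviz format; one way to render it to an image, assuming the Graphviz dot binary is installed and on the PATH:

import subprocess

# Render the exported decision tree to a PNG next to tree.dot.
subprocess.run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'], check=True)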
Example #8
def lr_with_scale():
    """
    Submission: lr_with_scale_0620_04.csv
    E_val: <missing>
    E_in: 0.857351105162
    E_out: 0.854097855439904
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_scaled, y)
    print(auc_score(clf, X_scaled, y))
    to_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                  'lr_with_scale_0620_04')
Example #9
def dropout_history(enrollment_set, base_date):
    X_pkl_path = util.cache_path('dropout_history_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('dropout_history')

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('Dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        logger.debug('load from cache')

        Dropout_count = util.fetch(pkl_path)
    else:
        logger.debug('preparing datasets')

        Enroll_all = util.load_enrollments()

        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log_enroll_ids = pd.DataFrame(np.unique(Log['enrollment_id']),
                                      columns=['enrollment_id'])

        logger.debug('datasets prepared')

        params = []
        enroll_ids = []
        for i, df in Log.groupby(['enrollment_id']):
            params.append(df)
            enroll_ids.append(i)
        pool = par.Pool(processes=min(n_proc, len(params)))
        enroll_dropout_count = dict(
            zip(enroll_ids, pool.map(__get_dropout_feature__, params)))
        pool.close()
        pool.join()

        enroll_dropout_count = pd.Series(enroll_dropout_count,
                                         name='dropout_count')
        enroll_dropout_count.index.name = 'enrollment_id'
        enroll_dropout_count = enroll_dropout_count.reset_index()

        Enroll_counted = pd.merge(Enroll_all,
                                  enroll_dropout_count,
                                  how='left',
                                  on=['enrollment_id'])
        Dropout_count = pd.merge(Log_enroll_ids,
                                 Enroll_counted,
                                 how='left',
                                 on=['enrollment_id'])

        util.dump(Dropout_count, pkl_path)

    Dgb = Dropout_count.groupby('username')
    total_dropout = Dgb.agg({
        'dropout_count': np.sum
    }).reset_index().rename(columns={'dropout_count': 'total_dropout'})
    avg_dropout = Dgb.agg({
        'dropout_count': np.average
    }).reset_index().rename(columns={'dropout_count': 'avg_dropout'})
    drop_courses = Dgb.agg(
        {'dropout_count': lambda x: len([i for i in x if i > 0])})\
        .reset_index().rename(columns={'dropout_count': 'drop_courses'})
    course_count = Dgb.agg({
        'dropout_count': len
    }).reset_index().rename(columns={'dropout_count': 'course_count'})

    Dropout_count = pd.merge(Dropout_count,
                             total_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             avg_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             drop_courses,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             course_count,
                             how='left',
                             on=['username'])

    Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
                                   Dropout_count['course_count'])

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    X = pd.merge(Enroll, Dropout_count, how='left', on=['enrollment_id'])\
        .as_matrix(columns=['dropout_count', 'total_dropout', 'avg_dropout',
                            'drop_courses', 'course_count', 'drop_ratio'])

    logger.debug('dropout history, has nan: %s, shape: %s',
                 np.any(np.isnan(X)), repr(X.shape))

    util.dump(X, X_pkl_path)
    return X
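The four per-user statistics above are computed with one groupby per statistic plus four separate merges. With a more recent pandas (>= 0.25, named aggregation) the same numbers can be produced in a single pass; a hedged sketch, assuming Dropout_count as above:

user_stats = Dropout_count.groupby('username')['dropout_count'].agg(
    total_dropout='sum',
    avg_dropout='mean',
    drop_courses=lambda x: (x > 0).sum(),
    course_count='size',
).reset_index()
# Dropout_count = Dropout_count.merge(user_stats, how='left', on='username')
# Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
#                                Dropout_count['course_count'])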
Example #10
        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y


if __name__ == '__main__':
    import glob
    if sys.argv[1] == 'clean':
        cached_files = glob.glob(util.cache_path('train_X*.pkl'))
        cached_files += glob.glob(util.cache_path('train_X*.pklz'))
        cached_files += glob.glob(util.cache_path('train_X*.pkl.gz'))
        cached_files += glob.glob(util.cache_path('train_y*.pkl'))
        cached_files += glob.glob(util.cache_path('train_y*.pklz'))
        cached_files += glob.glob(util.cache_path('train_y*.pkl.gz'))
        cached_files += glob.glob(util.cache_path('test_X*.pkl'))
        cached_files += glob.glob(util.cache_path('test_X*.pklz'))
        cached_files += glob.glob(util.cache_path('test_X*.pkl.gz'))
        for path in cached_files:
            os.remove(path)

    elif sys.argv[1] == 'gen':
        X, y = load_train(cache_only=True)
        print('X.shape: %d x %d' % X.shape)
        print('y.shape: %d' % y.shape)
Example #11
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'),
              step=1,
              n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc,
                            n_iter=50,
                            scoring='roc_auc',
                            n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_new, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(svc,
                                      cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    to_submission(
        Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                  ('scale_new', new_scaler), ('svc', isotonic)]),
        'svc_1_0620_01')
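The calibration step exists because LinearSVC only exposes decision_function, not predict_proba; wrapping the tuned SVC in CalibratedClassifierCV provides the probability estimates used for the AUC-scored submission. A minimal sketch against the current scikit-learn API (class_weight 'auto' has since become 'balanced', and the CV splitter no longer takes y), assuming X_new and y as above:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

svc = LinearSVC(dual=False, class_weight='balanced')
isotonic = CalibratedClassifierCV(svc, method='isotonic', cv=5)
# isotonic.fit(X_new, y)
# isotonic.predict_proba(X_new)[:, 1] is the positive-class probability scored by AUC.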
Example #12
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    *NOTE*  If you need a validation set, you SHOULD split it from the training
    set yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
    The base date will not be earlier than earlist_base_date.

    depth: int, 1 by default
    Maximum moves of time window.

    cache_only: bool, False by default
    If True, only cache the data of every period; do not return the full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features. If cache_only is True, these are the features over the full time span.

    y: numpy ndarray, shape: (num_of_enrollments,)
    Vector of labels. If cache_only is True, these are the labels over the full time span.
    """
    logger = logging.getLogger('load_train')

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    logger.debug('load features before %s', base_date)

    pkl_X_path = util.cache_path('train_X_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    pkl_y_path = util.cache_path('train_y_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]

        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)
    for _ in range(depth - 1):
        if enroll_ids.size <= 0:
            break
        if earlist_base_date is not None and base_date < earlist_base_date:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path = util.cache_path('train_X_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        pkl_y_path = util.cache_path('train_y_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)

            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
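The docstring notes that a validation set should be split off by the caller. A hedged example of doing so with a stratified hold-out split (recent scikit-learn; X and y as returned by load_train):

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)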
Example #13
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1

        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': np.sum}).reset_index()

    logger.debug('datasets prepared')

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted, on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])

    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])

        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x

        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])

    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())

        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())

        util.dump(course_dropout_count, pkl_path)

    X3 = np.array(
        [course_dropout_count.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(user_ops_count, pkl_path)

    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0

    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0

    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([
        course_dropout_count.get(c, 0) / course_population.get(c, 1)
        for c in Enroll['course_id']
    ])

    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array(
        [course_time.get(c, default_case)[0] for c in Enroll['course_id']])

    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array(
        [course_time.get(c, default_case)[1] for c in Enroll['course_id']])

    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])

    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(user_ops_time['day_diff']['amax'])

    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10

    logger.debug(
        'days from course first update to user first op, has nan: %s'
        ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X
Example #14
def read(path) -> DataFrame:
    def _clean(row):
        text = URL_REGEX.sub('', row.contents)
        if row.is_forward and '//@' in text:
            # A forwarded post in the expected format
            if text.startswith('//@'):
                # A pure forward: use the content of the original (root) post
                try:
                    text = FORWARD_CONTENT.findall(text)[-1]
                    i = FORWARD_SPLIT.match(text).regs[0][1]
                    text = text[i:]
                except IndexError:
                    text = text.replace('//@', '')  # TODO: could be handled via the Weibo API
            else:
                # Otherwise keep only the newly added content
                text = text[:text.find('//@')]
        return text

    temp_name = os.path.basename(path).replace('.xlsx', '')
    if os.path.isfile(cache_path(temp_name)):
        data, texts = load_cache(temp_name)
    else:
        output(f"===> Reading from <{path}>.")
        data: DataFrame = read_excel(path)  # .iloc[:280]

        # Keep only the four needed columns, drop missing values, and extract the date
        data = data[['contents', 'time', 'id', 'is_forward']].dropna().reset_index()
        data['date'] = data['time'].apply(lambda s: s[:10])
        data['contents'] = data['contents'].astype(str)

        # Preprocess the text
        texts = data.apply(_clean, axis=1).to_list()
        dump_cache((data, texts), temp_name)
    output(f"===> got {len(data)} rows from <{path}>.")

    # Parse the GPU IDs
    ltp_ids = [i.strip() for i in _ARGS.ltpIDS.split(',')]
    skep_ids = [i.strip() for i in _ARGS.skepIDS.split(',')]

    # Initialize the process pool, manager, and data queues
    pool = Pool(1 + len(ltp_ids) + len(skep_ids))  # tokenization, SKEP input preparation, and SKEP inference
    manager = Manager()
    feqture_queue = manager.Queue(16 * len(skep_ids))
    result_queue = manager.Queue(16 * len(skep_ids))

    # Launch the asynchronous tasks
    pool.apply_async(skep_producer, (feqture_queue, texts, 16, len(skep_ids)))
    tokens = dict()
    for i, (s, p) in zip(ltp_ids, generate_batch(texts, len(texts) // len(ltp_ids) + 1)):
        tokens[(s.start, s.stop)] = pool.apply_async(ltp_tokenzier, (p, 192, i))
    for i in skep_ids:
        pool.apply_async(skep_consumer, (feqture_queue, result_queue, i))

    # Collect the results
    scores, counter = zeros(len(texts)), 1
    while True:
        _slice, array = result_queue.get()
        # print(_slice)
        if array is None:
            if counter < len(skep_ids):
                counter += 1
            else:
                break
        else:
            scores[_slice] = array

    data['tokens'] = None
    for s, t in tokens.items():
        data['tokens'].update(Series(t.get(), range(*s)))
    data['sentiment_score'] = scores
    pool.close()
    pool.join()
    return data[['date', 'tokens', 'id', 'sentiment_score']]
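The queue handling above is a producer/consumer scheme: skep_producer pushes feature batches onto one queue, several skep_consumer processes pull from it and push results onto another, and each consumer signals completion with a None sentinel that the receiving loop counts. A stripped-down, self-contained sketch of that pattern (all names here are illustrative, not the project's):

from multiprocessing import Manager, Pool

def producer(q, items, n_workers):
    for item in items:
        q.put(item)
    for _ in range(n_workers):
        q.put(None)                    # one sentinel per consumer

def consumer(q, out_q):
    while True:
        item = q.get()
        if item is None:
            out_q.put(None)            # propagate the sentinel to the collector
            break
        out_q.put(item * 2)            # stand-in for the real per-batch work

if __name__ == '__main__':
    n_workers = 2
    manager = Manager()
    q, out_q = manager.Queue(), manager.Queue()
    with Pool(1 + n_workers) as pool:
        pool.apply_async(producer, (q, range(10), n_workers))
        for _ in range(n_workers):
            pool.apply_async(consumer, (q, out_q))
        done, results = 0, []
        while done < n_workers:
            item = out_q.get()
            if item is None:
                done += 1
            else:
                results.append(item)
    print(sorted(results))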