Example #1
def merge_datasets(train, test, slices):
    y_train = feature_utils.get_label(train)

    # Build the unique user-ID index for each set
    train_index = pd.Index(train['TERMINALNO']).unique()
    test_index = pd.Index(test['TERMINALNO']).unique()

    # Extract the user columns from the train and test sets
    train_data = pd.DataFrame(train_index, index=train_index)
    test_data = pd.DataFrame(test_index, index=test_index)

    # Load the features listed in Configure.features and merge them in
    for feature_name in Configure.features:
        # Concatenate the feature blocks from the different slices
        print('pd merge', feature_name)
        train_feature, test_feature = data_utils.load_features(feature_name, slices)

        print(train_feature.shape, test_feature.shape)
        train_data = pd.merge(train_data, train_feature,
                              left_index=True,
                              right_index=True, how='left')
        test_data = pd.merge(test_data, test_feature,
                             left_index=True,
                             right_index=True, how='left')
    print(train_data.shape, test_data.shape, y_train.shape, test_index.shape)

    return train_data, test_data, y_train, test_index
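
For reference, the loop above assumes data_utils.load_features returns one train and one test DataFrame per feature name, both indexed by user ID. A minimal sketch of such a loader, assuming per-slice pickle files (the file layout and naming are hypothetical, not the project's actual scheme):

import pandas as pd

def load_features(feature_name, slices):
    # Hypothetical layout: one pickle per slice; concatenating the slices
    # yields frames indexed by user ID, as the merge loop above expects.
    train_parts = [pd.read_pickle(f'features/{feature_name}_train_{i}.pkl')
                   for i in range(slices)]
    test_parts = [pd.read_pickle(f'features/{feature_name}_test_{i}.pkl')
                  for i in range(slices)]
    return pd.concat(train_parts), pd.concat(test_parts)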
Example #2
def merge_datasets():
    # 'ANSI' is not a valid Python codec name; assuming the Chinese Windows
    # codepage ('gbk') here, since the project's comments are Chinese.
    train_data_id_label = pd.read_csv(Configure.train_label_path, encoding='gbk')
    test_data = pd.read_csv(Configure.test_vid_path, encoding='gbk', usecols=['vid'])
    train_data = train_data_id_label[train_data_id_label.columns[0]]
    y_train = feature_utils.get_label(train_data_id_label)

    print(train_data, y_train, test_data)

    #
    # train_index = pd.Index(train['TERMINALNO'])
    # test_index = pd.Index(test['TERMINALNO'])
    # train_index = train_index.unique()
    # test_index = test_index.unique()
    #
    # # Extract the user columns from the train and test sets
    # train = pd.DataFrame(train_index, index=train_index)
    # test = pd.DataFrame(test_index, index=test_index)

    # Load the features listed in Configure.features and merge them in
    for feature_name in Configure.features:
        print('pd merge', feature_name)
        train_feature, test_feature = data_utils.load_features(feature_name)
        train_data = pd.merge(train_data, train_feature,
                              left_index=True,
                              right_index=True)
        test_data = pd.merge(test_data, test_feature,
                             left_index=True,
                             right_index=True)

    return train_data, test_data, y_train
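
Both examples above read paths and a feature list off a shared Configure object. A minimal sketch of what that object might hold, with placeholder paths and hypothetical feature names:

class Configure:
    # Paths and feature names are placeholders; the real values live in the project.
    train_label_path = 'data/train_labels.csv'
    test_vid_path = 'data/test_vids.csv'
    features = ['basic_stats', 'trip_stats']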
Example #3
def load_train_test():
    # Orders to be predicted (the raw train and test sets)
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv',
                        encoding='utf8',
                        dtype={"userid": str})
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv',
                       encoding='utf8',
                       dtype={"userid": str})

    # Load the features and merge them in
    features_merged_dict = Configure.features
    for feature_name in Configure.features:
        print('pd merge', feature_name)
        train_feature, test_feature = data_utils.load_features(feature_name)
        train_feature['userid'] = train_feature['userid'].astype(str)
        test_feature['userid'] = test_feature['userid'].astype(str)
        train = pd.merge(train,
                         train_feature,
                         on=features_merged_dict[feature_name]['on'],
                         how=features_merged_dict[feature_name]['how'])
        test = pd.merge(test,
                        test_feature,
                        on=features_merged_dict[feature_name]['on'],
                        how=features_merged_dict[feature_name]['how'])

    # # Oversample to correct the class imbalance
    # pos_train = train[train['orderType'] == 1]
    # neg_train = train[train['orderType'] == 0]
    # print('train, ordertype1: ', pos_train.shape[0], ', ordertype0: ', neg_train.shape[0], ', 1:0 = ', 1.0 * pos_train.shape[0] / neg_train.shape[0])
    #
    # sample_pos_size = int(pos_train.shape[0] * 0.05)
    # sample_pos_train = pos_train.sample(sample_pos_size, random_state=42)
    # train = pd.concat([neg_train, pos_train, sample_pos_train])
    # pos_train = train[train['orderType'] == 1]
    # print('train, ordertype1: ', pos_train.shape[0], ', ordertype0: ', neg_train.shape[0], ', 1:0 = ', 1.0 * pos_train.shape[0] / neg_train.shape[0])

    train.drop(['gender', 'province', 'age', 'has_history_flag'],
               axis=1,
               inplace=True)
    test.drop(['gender', 'province', 'age', 'has_history_flag'],
              axis=1,
              inplace=True)

    # # Drop features with very low importance
    # droped_features = ['user_rating_std']
    # train.drop(droped_features, axis=1, inplace=True)
    # test.drop(droped_features, axis=1, inplace=True)

    print('feature interaction')
    train, test = feature_interaction(train, test)

    print('discretize continuous features')
    train, test = discretize_features(train, test)

    return train, test
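
Unlike the first two examples, this merge loop reads the join key and join type from the feature dict itself. A minimal sketch of the structure Configure.features is assumed to have here, with hypothetical feature names:

class Configure:
    base_path = 'data/'  # placeholder
    features = {
        # Each entry tells load_train_test how to join that feature frame.
        'user_order_history': {'on': 'userid', 'how': 'left'},
        'user_action_stats': {'on': 'userid', 'how': 'left'},
    }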
Example #4
def load_datasets():
    print('load baseline features')
    train, test = load_0_97210_datasets()

    # These features are collinear with the better-performing history_order_type_sum_lg0
    # train.drop(['2016_2017_first_last_ordertype'], axis=1, inplace=True)
    # test.drop(['2016_2017_first_last_ordertype'], axis=1, inplace=True)

    # Load the features and merge them in
    features_merged_dict = Configure.new_features
    for feature_name in features_merged_dict:
        print('merge', feature_name)
        train_feature, test_feature = data_utils.load_features(feature_name)
        train = pd.merge(train, train_feature,
                         on=features_merged_dict[feature_name]['on'],
                         how=features_merged_dict[feature_name]['how'])
        test = pd.merge(test, test_feature,
                        on=features_merged_dict[feature_name]['on'],
                        how=features_merged_dict[feature_name]['how'])

    # # Per the rule, add test rows whose class is 1 to the training set; it blew up on the leaderboard!
    # sample_pos_test = test[test['history_order_type_sum_lg0'] == 1]
    # sample_pos_test['orderType'] = 1
    # train = pd.concat([train, sample_pos_test], axis=0)

    train.drop(['history_order_type_sum_lg0'], axis=1, inplace=True)
    test.drop(['history_order_type_sum_lg0'], axis=1, inplace=True)

    # train, test = remove_some_features(train, test)

    # with open('train_0.97329.pkl', "wb") as f:
    #     cPickle.dump(train, f, -1)
    # with open('test_0.97329.pkl', "wb") as f:
    #     cPickle.dump(test, f, -1)
    #
    return train, test
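
The commented-out block at the end hints at caching the merged frames to disk. A minimal sketch of that step using the standard pickle module (cPickle is its Python 2 name; protocol -1 selects the highest available protocol, as in the commented code):

import pickle

def cache_datasets(train, test):
    # File names follow the commented-out code above.
    with open('train_0.97329.pkl', 'wb') as f:
        pickle.dump(train, f, -1)
    with open('test_0.97329.pkl', 'wb') as f:
        pickle.dump(test, f, -1)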
Example #5
    parser.add_argument('--features',  # truncated opening restored from the args.features usage below
                        type=str,
                        help='Features directory.',
                        required=True)
    parser.add_argument('--sequences',
                        action='store_true',
                        help='Compute on packet sequences.',
                        required=False,
                        default=False)
    parser.add_argument('--out',
                        type=str,
                        help='Distance file (.distances).',
                        required=True)
    args = parser.parse_args()

    if not args.sequences:
        X, Y, W, _, _ = load_features(args.features)
    else:
        X, Y, W, _, _ = load_dataset(args.features)
        X = packet_sequences_only(X)

    log('Computing pairwise distances')
    D = pairwise_distances(X)
    log('Computing subtractions')

    log('Storing distances into {}'.format(args.out))

    data = {
        'webpage-id': W,
        'label': np.array(Y),
        'pairdist': D,
    }
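
The snippet logs that it is storing the distances, but the actual write was cut off in this listing. A minimal sketch of persisting the data dict, assuming the .distances file is a pickle (the on-disk format is an assumption):

import pickle

with open(args.out, 'wb') as f:
    pickle.dump(data, f)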
Example #6
    parser.add_argument('--test',
                        type=float,
                        help='Percentage (or number) of test instances.',
                        required=True)
    parser.add_argument('--seed',
                        type=int,
                        help='PRNG seed (default: 0).',
                        required=False,
                        default=0)
    parser.add_argument('--out',
                        type=str,
                        help='Output file (.json).',
                        required=True)
    args = parser.parse_args()

    X, Y, _, Npages, Nloads = load_features(args.features)

    log('Seed is {}'.format(args.seed))

    n = len(X)
    # Get training/test set size
    if args.train > 1:
        train_size = int(args.train)
    else:
        train_size = int(args.train * n)
    if args.test > 1:
        test_size = int(args.test)
    else:
        test_size = int(args.test * n)

    log('Training set size: {}. Test set size: {}.'.format(
        train_size, test_size))
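
With the sizes computed, a seeded random split might look like the following numpy-based sketch (the original's split logic was not shown, so this is an assumption):

import numpy as np

rng = np.random.RandomState(args.seed)  # deterministic given --seed
perm = rng.permutation(n)
train_idx = perm[:train_size]
test_idx = perm[train_size:train_size + test_size]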