示例#1
0
def load_features():
    """Build the train and test feature tables.

    Reads the raw train/test CSV files named by ``Configure``, merges in every
    feature set listed in ``Configure.features`` (each entry supplies the
    ``'on'`` and ``'how'`` arguments for its merge), replaces missing values
    with 0, and finally drops ``id``, ``q1``, ``q2`` and their
    ``_words``/``_chars`` columns (plus ``label`` for the train table).

    Returns:
        A ``(train, test)`` tuple of pandas DataFrames.
    """
    # Raw samples to predict on (original training and test sets).
    train = pd.read_csv(Configure.train_data_file, encoding='utf8')
    test = pd.read_csv(Configure.test_data_file, encoding='utf8')
    # Positional row id; presumably the key the feature tables are merged on
    # (the actual key comes from each feature's 'on' setting) -- confirm.
    train['id'] = np.arange(train.shape[0])
    test['id'] = np.arange(test.shape[0])

    # Load each configured feature set and merge it in.
    features_merged_dict = Configure.features
    for feature_name in features_merged_dict:
        # Single pre-formatted string: prints the same text on Python 2 and 3
        # (the old `print a, b` statement is a SyntaxError on Python 3).
        print('merge features: %s' % (feature_name,))
        merge_cfg = features_merged_dict[feature_name]
        train_feature, test_feature = data_utils.load_features(feature_name)
        # Remove the feature table's own 'label' so it is not merged in as an
        # extra column alongside the one `train` drops below.
        if 'label' in train_feature.columns:
            del train_feature['label']

        train = pd.merge(train, train_feature,
                         on=merge_cfg['on'], how=merge_cfg['how'])
        test = pd.merge(test, test_feature,
                        on=merge_cfg['on'], how=merge_cfg['how'])

    # Rows without a match in a feature table get 0 for the missing values.
    train.fillna(0, inplace=True)
    test.fillna(0, inplace=True)

    # Columns removed from both tables; 'label' is additionally removed from
    # the train table only.
    raw_columns = ['id', 'q1', 'q2', 'q1_words', 'q1_chars', 'q2_words', 'q2_chars']
    train.drop(raw_columns + ['label'], axis=1, inplace=True)
    test.drop(raw_columns, axis=1, inplace=True)

    return train, test
示例#2
0
n_classes = 5
# Display name for each class index 0..4.
classNames = {0: 'Disc', 1: 'Spiral', 2: 'Elliptical', 3: 'Round', 4: 'Other'}

# Project data handler; it is passed to data_utils.load_features below to
# fetch the individual splits.
handler = data_utils.data_handler(data_dir_path,
                                  sample_fractions=sample_fractions,
                                  input_size=input_size,
                                  labels_type='classes',
                                  output_size=output_size,
                                  normalize_input=False,
                                  create_samples_bool=False,
                                  preprocess_bool=False,
                                  crp_factor=2,
                                  ds_factor=3)

### Load data
X_train, y_train = data_utils.load_features(handler, 'training')
X_val, y_val = data_utils.load_features(handler, 'validation')

### Grid-search the number of neighbours of a k-NN classifier
# Candidate k values: the odd numbers 1, 3, ..., 29.
param_grid = {"n_neighbors": np.arange(1, 31, 2)}

knn = KNeighborsClassifier(metric="euclidean")
# 10-fold cross-validation over every candidate k.
clf = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)

t0 = time()
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print(clf.best_estimator_)

# Mean cross-validated score of each candidate, in param_grid order.
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20, so
# prefer `cv_results_` and only fall back on releases that predate it.
if hasattr(clf, 'cv_results_'):
    acc_score = list(clf.cv_results_['mean_test_score'])
else:
    acc_score = [x[1] for x in clf.grid_scores_]
示例#3
0
    parser.add_argument('--test',
                        type=float,
                        help='Percentage (or number) of test instances.',
                        required=True)
    parser.add_argument('--seed',
                        type=int,
                        help='PRNG seed (default: 0).',
                        required=False,
                        default=0)
    parser.add_argument('--out',
                        type=str,
                        help='Output file (.json).',
                        required=True)
    args = parser.parse_args()

    X, Y, _, Npages, Nloads = load_features(args.features)

    log('Seed is {}'.format(args.seed))

    n = len(X)
    # Get training/test set size
    if args.train > 1:
        train_size = int(args.train)
    else:
        train_size = int(args.train * n)
    if args.test > 1:
        test_size = int(args.test)
    else:
        test_size = int(args.test * n)

    log('Training set size: {}. Test set size: {}.'.format(