def load_features():
    """Load the raw train/test sets and merge in every configured feature table.

    Reads the CSV files named by ``Configure.train_data_file`` and
    ``Configure.test_data_file``, adds a positional ``id`` column to each, and
    then merges, one feature at a time, the frames returned by
    ``data_utils.load_features`` for every entry of ``Configure.features``.
    The join key(s) and join type of each merge are taken from that entry's
    ``'on'`` and ``'how'`` values.

    Returns:
        tuple: ``(train, test)`` DataFrames in which missing values have been
        replaced by 0 and the ``id`` / raw question columns (plus ``label``
        for the training frame) have been dropped.
    """
    # Raw training and test sets (the rows to be predicted).
    train = pd.read_csv(Configure.train_data_file, encoding='utf8')
    test = pd.read_csv(Configure.test_data_file, encoding='utf8')

    # Positional row id for each frame.
    train['id'] = np.arange(train.shape[0])
    test['id'] = np.arange(test.shape[0])

    # Load each feature table and merge it into both frames.
    features_merged_dict = Configure.features
    for feature_name in Configure.features:
        # Single-argument print: same output under Python 2 and Python 3.
        print('merge features: %s' % (feature_name,))
        train_feature, test_feature = data_utils.load_features(feature_name)
        # The base training frame already carries the label; drop the
        # feature table's copy before merging.
        if 'label' in train_feature.columns:
            del train_feature['label']

        merge_spec = features_merged_dict[feature_name]
        train = pd.merge(train, train_feature,
                         on=merge_spec['on'], how=merge_spec['how'])
        test = pd.merge(test, test_feature,
                        on=merge_spec['on'], how=merge_spec['how'])

    # Rows without a match in some feature table end up with NaN; use 0.
    train.fillna(0, inplace=True)
    test.fillna(0, inplace=True)

    # Remove the id and raw-text columns; the training frame also loses
    # its label here.
    train.drop(['id', 'q1', 'q2', 'q1_words', 'q1_chars',
                'q2_words', 'q2_chars', 'label'], axis=1, inplace=True)
    test.drop(['id', 'q1', 'q2', 'q1_words', 'q1_chars',
               'q2_words', 'q2_chars'], axis=1, inplace=True)
    return train, test
# Number of target classes and their display names.
n_classes = 5
classNames = {0: 'Disc', 1: 'Spiral', 2: 'Elliptical', 3: 'Round', 4: 'Other'}

# Data handler configured for class labels; sample creation, preprocessing
# and input normalisation are all switched off here.
handler = data_utils.data_handler(data_dir_path,
                                  sample_fractions=sample_fractions,
                                  input_size=input_size,
                                  labels_type='classes',
                                  output_size=output_size,
                                  normalize_input=False,
                                  create_samples_bool=False,
                                  preprocess_bool=False,
                                  crp_factor=2,
                                  ds_factor=3)

### Load data
X_train, y_train = data_utils.load_features(handler, 'training')
X_val, y_val = data_utils.load_features(handler, 'validation')

### Grid-search n_neighbors (odd values 1..29) for a Euclidean k-NN classifier
param_grid = {"n_neighbors": np.arange(1, 31, 2)}
knn = KNeighborsClassifier(metric="euclidean")
clf = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)

t0 = time()
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print(clf.best_estimator_)

# Mean cross-validated score of every candidate, in grid order.
# `grid_scores_` was removed in scikit-learn 0.20; prefer `cv_results_`
# when the installed version provides it and fall back otherwise.
if hasattr(clf, "cv_results_"):
    acc_score = list(clf.cv_results_["mean_test_score"])
else:
    acc_score = [x[1] for x in clf.grid_scores_]
parser.add_argument('--test', type=float, help='Percentage (or number) of test instances.', required=True) parser.add_argument('--seed', type=int, help='PRNG seed (default: 0).', required=False, default=0) parser.add_argument('--out', type=str, help='Output file (.json).', required=True) args = parser.parse_args() X, Y, _, Npages, Nloads = load_features(args.features) log('Seed is {}'.format(args.seed)) n = len(X) # Get training/test set size if args.train > 1: train_size = int(args.train) else: train_size = int(args.train * n) if args.test > 1: test_size = int(args.test) else: test_size = int(args.test * n) log('Training set size: {}. Test set size: {}.'.format(