import re
from data_helper.names import keys, get_filename
from data_helper.features import Condition, FeatureException
import numpy as np
from operator import itemgetter
import settings
import matplotlib.pyplot as plt
from util.files import Exporter

# Pool the 'prices_centered' values reported for each condition across every
# key that contains exactly two underscores, then average each pool.
conditions = {}
for key in keys():
    # Skip keys that do not contain exactly two underscores.
    if len(re.findall('_', key)) != 2:
        continue

    fname = get_filename(key)
    try:
        ConditionStats = Condition(import_dir=settings.CLUSTER_DIR).load(
            fname, remove_outliers=True).make_condition_stats()
    except FeatureException as e:
        # A key whose stats cannot be built is reported and skipped.
        print(str(e))
        continue

    stats = ConditionStats.condition_stats('usa')
    for condition, val in stats.iteritems():
        try:
            centered = val['prices_centered']
        except KeyError:
            # Entries without centered prices contribute nothing.
            continue

        if condition in conditions:
            # `+` builds a new object, so the stats' own value is not mutated.
            conditions[condition] = conditions[condition] + centered
        else:
            conditions[condition] = centered

# One [condition as float, mean centered price] pair per condition.
results = [[float(condition), np.average(offset)]
           for condition, offset in conditions.iteritems()]
def test(FID, show_price_pairs=default_show_price_pairs, retrain=True):
    """Train a price model for ``FID`` and print its held-out scores.

    The dataset is vectorized with ``DictVectorizer``, split 70/30 into
    train and test rows, and a model is fitted with ``train``. Everything
    is reported via ``print``; nothing is returned.

    Args:
        FID: Identifier passed to ``get_filename`` to locate the dataset.
        show_price_pairs: Forwarded as ``show_prices`` to the final
            ``predict.run`` call on the test rows.
        retrain: When true, the training rows are passed through
            ``filter_worst`` (using the first-pass predictions) and the
            model is fitted a second time on what remains.
    """
    dataset, price_mmr_state_condition_vin = get_dataset(
        get_filename(FID),
        remove_initial_outliers=False,
        extra_continuous_exclusions=EXTRA_CONTINUOUS_EXCLUSIONS,
        extra_categorical_exclusions=EXTRA_CATEGORICAL_EXCLUSIONS,
        expand_odometer=True,
        capture_condition=True,
        capture_state=True)

    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(dataset).toarray()
    feature_names = vectorizer.get_feature_names()

    # Hold out 30% of the rows as a testing set.
    X_train, X_test, y_train, y_test = train_test_split(
        matrix, price_mmr_state_condition_vin, test_size=0.3)
    # The first element of each target row is used as the price to fit.
    price_train = [row[0] for row in y_train]

    # IDEAL: n_samples > n_features ** 2
    min_sample_size = len(feature_names) ** 2
    sample_size = len(X_train)
    print('training set: %s samples, %s features^2' % (sample_size, min_sample_size))

    # First pass: fit on the full training split, then predict on those
    # same rows.
    model = train(X_train, price_train)
    predicted_y, _, _ = predict.run(model, X_train, y_train, show_prices=False)

    if retrain:
        # Per the original comment: remove the worst performers and treat
        # them as outliers, then fit again on the remaining rows.
        X_train, y_train = filter_worst(X_train, y_train, predicted_y)
        price_train = [row[0] for row in y_train]
        model = train(X_train, price_train)

        sample_size = len(X_train)
        print('\ntraining set (outliers removed): %s samples, %s features^2' % (sample_size, min_sample_size))

    # NOTE(review): the source's indentation was lost; this check is assumed
    # to run whether or not `retrain` is set - confirm against the original.
    if sample_size > min_sample_size:
        print('sufficient sample to feature ratio')
    else:
        print('WARN - insufficient sample to feature ratio to be highly confident of fit')

    price_test = [row[0] for row in y_test]
    scores = cross_val_score(model, X_test, price_test, cv=5, scoring='r2')
    print('Grid Search Results')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    _, r2, variance = predict.run(
        model, X_test, y_test,
        offset_state=False,
        offset_condition=False,
        show_prices=show_price_pairs)
    print('held out test sample results:')
    print('test r^2 score: %s' % r2)
    print('test explained variance: %s' % variance)
    print('\n')