def load(self, input_file, import_dir=None, vehicle_type='economy',
         remove_outliers=False):
    """Import a data file and keep its records on this instance.

    Args:
        input_file: File name, resolved relative to the import directory.
        import_dir: Directory (relative to ``self.base_dir``) holding the
            file. Falls back to ``self.import_dir`` when ``None``.
        vehicle_type: Accepted for interface compatibility; this method
            does not read it.
        remove_outliers: When true, pass the imported records through
            ``RemoveOutliers(...).mmr(contamination=0.05)`` before storing.

    Returns:
        ``self``, with ``all_data`` and ``input_file_path`` set.

    Raises:
        FeatureException: If outlier removal is requested but the import
            produced five records or fewer.
    """
    chosen_dir = self.import_dir if import_dir is None else import_dir
    resolved_dir = os.path.join(self.base_dir, chosen_dir)
    records, source_path = self.import_(os.path.join(resolved_dir, input_file))

    # mmr is known to be highly correlated with selling price, so the
    # outlier filter is based on that correlation.
    if remove_outliers:
        if len(records) <= 5:
            raise FeatureException('not enough data')
        records = RemoveOutliers(records).mmr(contamination=0.05)

    self.all_data = records
    self.input_file_path = source_path
    return self
def get_dataset(fname, remove_initial_outliers=True,
                extra_categorical_exclusions=None,
                extra_continuous_exclusions=None, expand_odometer=False,
                capture_condition=False, capture_state=False):
    """Load a dataset and split each record into features and target values.

    The file is read from ``settings.BASE_DIR/settings.CLUSTER_DIR/fname``
    through ``Importer().import_``. Every record is then modified in place:
    an empty ``transmission`` becomes ``'automatic'``, ``sellingprice`` is
    removed, excluded feature keys are dropped, and keys listed in
    ``settings.CATEGORICAL_STRING_COERCED_LABELS`` are converted to ``str``.

    Args:
        fname: File name inside the cluster directory.
        remove_initial_outliers: When true, filter the imported records with
            ``RemoveOutliers(dataset).mmr()`` before any other processing.
        extra_categorical_exclusions: Additional keys to drop from every
            record; ``None`` means no extra keys.
        extra_continuous_exclusions: Additional keys to drop from every
            record; ``None`` means no extra keys.
        expand_odometer: When true, replace ``odometer`` with polynomial
            terms stored under ``odometer0``, ``odometer1``, ...
        capture_condition: When true, append the record's ``condition`` to
            its output row.
        capture_state: When true, append the record's ``state`` to its
            output row.

    Returns:
        A ``(dataset, output_vals)`` pair. ``output_vals`` holds one list
        per record: ``[sellingprice, mmr, vin]`` optionally followed by
        ``condition`` and/or ``state``.
    """
    extra_categorical = ([] if extra_categorical_exclusions is None
                         else extra_categorical_exclusions)
    extra_continuous = ([] if extra_continuous_exclusions is None
                        else extra_continuous_exclusions)

    data_path = os.path.abspath(
        os.path.join(settings.BASE_DIR, settings.CLUSTER_DIR, fname))
    dataset, _ = Importer().import_(data_path)

    # Optionally remove outliers before any training is done; mmr is known
    # to be highly correlated with selling price.
    if remove_initial_outliers:
        dataset = RemoveOutliers(dataset).mmr()

    # Only build the expander when it is actually used.
    # NOTE(review): odometer_expansion_degree is not defined in this
    # function; presumably a module-level setting - confirm.
    poly = (PolynomialFeatures(odometer_expansion_degree)
            if expand_odometer else None)

    output_vals = []
    for item in dataset:
        if item['transmission'] == '':
            item['transmission'] = 'automatic'

        if expand_odometer:
            # scikit-learn transformers expect a 2D (n_samples, n_features)
            # input, so wrap the single reading as one sample with one
            # feature. Assumes item['odometer'] is a scalar - TODO confirm.
            expanded = poly.fit_transform([[item['odometer']]],
                                          item['sellingprice'])
            del item['odometer']
            for i, coef in enumerate(expanded[0]):
                item['odometer' + str(i)] = coef

        val = [item['sellingprice'], item['mmr'], item['vin']]
        if capture_condition:
            val.append(item['condition'])
        if capture_state:
            val.append(item['state'])
        output_vals.append(val)

        # Remove the selling price from the features, otherwise it would
        # obviously correlate perfectly with the target.
        del item['sellingprice']

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        # NOTE(review): the original comment says mmr is removed from this
        # point on - presumably via the excluded labels; confirm in settings.
        for key in list(item.keys()):
            if key in settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS:
                del item[key]
            elif key in extra_categorical:
                del item[key]
            elif key in extra_continuous:
                del item[key]
            elif key in settings.CATEGORICAL_STRING_COERCED_LABELS:
                item[key] = str(item[key])

    return dataset, output_vals
def get_odometer_dataset(fname, remove_initial_outliers=True,
                         extra_categorical_exclusions=None,
                         extra_continuous_exclusions=None,
                         condition_adjust=False, state_adjust=False):
    """Load a dataset, optionally de-bias prices, and split out the targets.

    The file is read from ``settings.BASE_DIR/settings.CLUSTER_DIR/fname``
    through ``Importer().import_``. Every record is then modified in place:
    an empty ``transmission`` becomes ``'automatic'``, the selling price is
    optionally shifted by per-condition and per-state averages,
    ``sellingprice`` is removed, excluded feature keys are dropped, and keys
    listed in ``settings.CATEGORICAL_STRING_COERCED_LABELS`` are converted
    to ``str``.

    Args:
        fname: File name inside the cluster directory.
        remove_initial_outliers: When true, filter the imported records with
            ``RemoveOutliers(dataset).mmr()`` before any other processing.
        extra_categorical_exclusions: Additional keys to drop from every
            record; ``None`` means no extra keys.
        extra_continuous_exclusions: Additional keys to drop from every
            record; ``None`` means no extra keys.
        condition_adjust: ``False`` or ``None`` to skip, otherwise a mapping
            indexed by ``str(item['condition'])`` whose entries provide
            ``['stats']['avg']``; that average is subtracted from the price.
        state_adjust: ``False`` or ``None`` to skip, otherwise a mapping
            indexed by ``str(item['state'])`` whose entries provide
            ``['stats']['avg']``; that average is subtracted from the price.

    Returns:
        A ``(dataset, price_mmr_vin)`` pair. ``price_mmr_vin`` holds one
        ``[sellingprice, mmr, vin]`` list per record, with the selling price
        taken after any adjustments.
    """
    extra_categorical = ([] if extra_categorical_exclusions is None
                         else extra_categorical_exclusions)
    extra_continuous = ([] if extra_continuous_exclusions is None
                        else extra_continuous_exclusions)

    data_path = os.path.abspath(
        os.path.join(settings.BASE_DIR, settings.CLUSTER_DIR, fname))
    dataset, _ = Importer().import_(data_path)

    # Optionally remove outliers before any training is done; mmr is known
    # to be highly correlated with selling price.
    if remove_initial_outliers:
        dataset = RemoveOutliers(dataset).mmr()

    # Treat None like the documented False default instead of failing with
    # a TypeError when the lookup table is subscripted.
    use_condition = condition_adjust is not False and condition_adjust is not None
    use_state = state_adjust is not False and state_adjust is not None

    price_mmr_vin = []
    for item in dataset:
        if item['transmission'] == '':
            item['transmission'] = 'automatic'

        if use_condition:
            # Reverse the bias attributed to the record's condition.
            offset = condition_adjust[str(item['condition'])]['stats']['avg']
            item['sellingprice'] = item['sellingprice'] - offset

        if use_state:
            # Reverse the bias attributed to the record's state.
            offset = state_adjust[str(item['state'])]['stats']['avg']
            item['sellingprice'] = item['sellingprice'] - offset

        price_mmr_vin.append([item['sellingprice'], item['mmr'], item['vin']])

        # Remove the selling price from the features, otherwise it would
        # obviously correlate perfectly with the target.
        del item['sellingprice']

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        # NOTE(review): the original comment says mmr is removed from this
        # point on - presumably via the excluded labels; confirm in settings.
        for key in list(item.keys()):
            if key in settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS:
                del item[key]
            elif key in extra_categorical:
                del item[key]
            elif key in extra_continuous:
                del item[key]
            elif key in settings.CATEGORICAL_STRING_COERCED_LABELS:
                item[key] = str(item[key])

    return dataset, price_mmr_vin