Пример #1
0
    def load(self, input_file, import_dir=None, vehicle_type='economy', remove_outliers=False):
        """Import ``input_file`` and store the resulting records on this instance.

        The file location is built by joining ``self.base_dir``, the import
        directory and ``input_file``.

        Args:
            input_file: File name, joined onto the resolved import directory.
            import_dir: Directory joined onto ``self.base_dir``; ``None``
                selects ``self.import_dir``.
            vehicle_type: Accepted but not used by this method.
            remove_outliers: When true, the imported records are filtered
                through ``RemoveOutliers(...).mmr(contamination=0.05)``.

        Returns:
            ``self``, with ``all_data`` and ``input_file_path`` set.

        Raises:
            FeatureException: If ``remove_outliers`` is set and five or fewer
                records were imported.
        """
        chosen_dir = self.import_dir if import_dir is None else import_dir
        source_dir = os.path.join(self.base_dir, chosen_dir)

        records, resolved_path = self.import_(os.path.join(source_dir, input_file))

        # mmr is known to correlate strongly with selling price, so the
        # outlier filter is driven by that relationship.
        if remove_outliers:
            if len(records) <= 5:
                raise FeatureException('not enough data')
            records = RemoveOutliers(records).mmr(contamination=0.05)

        self.all_data = records
        self.input_file_path = resolved_path

        return self
Пример #2
0
def get_dataset(fname, remove_initial_outliers=True, extra_categorical_exclusions=None, extra_continuous_exclusions=None, expand_odometer=False, capture_condition=False, capture_state=False):
    """Import a dataset and separate the target columns from the features.

    The file ``fname`` is looked up under ``settings.BASE_DIR`` /
    ``settings.CLUSTER_DIR`` and read with ``Importer().import_``. Each
    record is then modified in place:

    * an empty ``'transmission'`` becomes ``'automatic'``;
    * with ``expand_odometer``, ``'odometer'`` is replaced by its polynomial
      terms under the keys ``'odometer0'``, ``'odometer1'``, ...;
    * ``'sellingprice'`` is removed after being captured;
    * keys listed in ``settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS`` or in
      either ``extra_*_exclusions`` argument are removed;
    * remaining keys listed in ``settings.CATEGORICAL_STRING_COERCED_LABELS``
      have their values converted with ``str``.

    Args:
        fname: File name inside the cluster directory.
        remove_initial_outliers: When true, filter the imported records with
            ``RemoveOutliers(dataset).mmr()`` before any other processing.
        extra_categorical_exclusions: Additional keys to drop (``None`` means
            none).
        extra_continuous_exclusions: Additional keys to drop (``None`` means
            none).
        expand_odometer: When true, expand ``'odometer'`` with
            ``PolynomialFeatures``.
        capture_condition: When true, append ``item['condition']`` to each
            captured row.
        capture_state: When true, append ``item['state']`` to each captured
            row.

    Returns:
        A ``(dataset, output_vals)`` tuple where ``dataset`` holds the
        modified records and ``output_vals`` holds one
        ``[sellingprice, mmr, vin, (condition), (state)]`` list per record.
    """
    categorical_exclusions = (
        [] if extra_categorical_exclusions is None else extra_categorical_exclusions
    )
    continuous_exclusions = (
        [] if extra_continuous_exclusions is None else extra_continuous_exclusions
    )

    data_path = os.path.abspath(os.path.join(settings.BASE_DIR, settings.CLUSTER_DIR, fname))

    dataset, _ = Importer().import_(data_path)

    # Optionally remove outliers before any training is done; mmr is known to
    # be highly correlated with selling price.
    if remove_initial_outliers:
        dataset = RemoveOutliers(dataset).mmr()

    # Only build the transformer when it is needed.
    # NOTE(review): `odometer_expansion_degree` is not defined in this
    # function; it is presumably a module-level setting -- confirm.
    poly = PolynomialFeatures(odometer_expansion_degree) if expand_odometer else None

    output_vals = []
    for item in dataset:

        if item['transmission'] == '':
            item['transmission'] = 'automatic'

        if expand_odometer:
            # fit_transform expects a 2-D (n_samples, n_features) input, so
            # the single reading is wrapped as one sample with one feature.
            expanded = poly.fit_transform([[item['odometer']]])
            del item['odometer']

            for i, coef in enumerate(expanded[0]):
                item['odometer' + str(i)] = coef

        val = [item['sellingprice'], item['mmr'], item['vin']]

        if capture_condition:
            val.append(item['condition'])

        if capture_state:
            val.append(item['state'])

        output_vals.append(val)

        # Remove the selling price from the features, otherwise they would
        # trivially correlate with the target.
        del item['sellingprice']

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        # mmr is dropped here only if it appears in one of the exclusion lists.
        for key in list(item.keys()):
            if (key in settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS
                    or key in categorical_exclusions
                    or key in continuous_exclusions):
                del item[key]
            elif key in settings.CATEGORICAL_STRING_COERCED_LABELS:
                item[key] = str(item[key])

    return dataset, output_vals
Пример #3
0
def get_odometer_dataset(fname, remove_initial_outliers=True, extra_categorical_exclusions=None, extra_continuous_exclusions=None, condition_adjust=False, state_adjust=False):
    """Import a dataset, optionally de-bias prices, and split off the targets.

    The file ``fname`` is looked up under ``settings.BASE_DIR`` /
    ``settings.CLUSTER_DIR`` and read with ``Importer().import_``. Each
    record is then modified in place:

    * an empty ``'transmission'`` becomes ``'automatic'``;
    * the selling price is reduced by the condition and/or state offsets when
      the corresponding adjustment table is supplied;
    * ``'sellingprice'`` is removed after being captured;
    * keys listed in ``settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS`` or in
      either ``extra_*_exclusions`` argument are removed;
    * remaining keys listed in ``settings.CATEGORICAL_STRING_COERCED_LABELS``
      have their values converted with ``str``.

    Args:
        fname: File name inside the cluster directory.
        remove_initial_outliers: When true, filter the imported records with
            ``RemoveOutliers(dataset).mmr()`` before any other processing.
        extra_categorical_exclusions: Additional keys to drop (``None`` means
            none).
        extra_continuous_exclusions: Additional keys to drop (``None`` means
            none).
        condition_adjust: ``False`` to skip, otherwise a mapping indexed as
            ``condition_adjust[str(condition)]['stats']['avg']``.
        state_adjust: ``False`` to skip, otherwise a mapping indexed as
            ``state_adjust[str(state)]['stats']['avg']``.

    Returns:
        A ``(dataset, price_mmr_vin)`` tuple where ``dataset`` holds the
        modified records and ``price_mmr_vin`` holds one
        ``[sellingprice, mmr, vin]`` list per record (price after any
        adjustment).
    """
    categorical_exclusions = (
        [] if extra_categorical_exclusions is None else extra_categorical_exclusions
    )
    continuous_exclusions = (
        [] if extra_continuous_exclusions is None else extra_continuous_exclusions
    )

    data_path = os.path.abspath(os.path.join(settings.BASE_DIR, settings.CLUSTER_DIR, fname))

    dataset, _ = Importer().import_(data_path)

    # Optionally remove outliers before any training is done; mmr is known to
    # be highly correlated with selling price.
    if remove_initial_outliers:
        dataset = RemoveOutliers(dataset).mmr()

    price_mmr_vin = []
    for item in dataset:

        if item['transmission'] == '':
            item['transmission'] = 'automatic'

        if condition_adjust is not False:
            # Reverse the bias attributed to the vehicle's condition.
            offset = condition_adjust[str(item['condition'])]['stats']['avg']
            item['sellingprice'] = item['sellingprice'] - offset

        if state_adjust is not False:
            # Reverse the bias attributed to the state.
            offset = state_adjust[str(item['state'])]['stats']['avg']
            item['sellingprice'] = item['sellingprice'] - offset

        price_mmr_vin.append([item['sellingprice'], item['mmr'], item['vin']])

        # Remove the selling price from the features, otherwise they would
        # trivially correlate with the target.
        del item['sellingprice']

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        # mmr is dropped here only if it appears in one of the exclusion lists.
        for key in list(item.keys()):
            if (key in settings.EXCLUDED_CATEGORICAL_FEATURE_LABELS
                    or key in categorical_exclusions
                    or key in continuous_exclusions):
                del item[key]
            elif key in settings.CATEGORICAL_STRING_COERCED_LABELS:
                item[key] = str(item[key])

    return dataset, price_mmr_vin