Exemplo n.º 1
0
    def get_summary(self, data, time_low, time_high, IQR=1.5):
        """Summarize one time bucket and flag fare outliers via a log-IQR fence.

        Args:
            data (pandas.DataFrame): trips in this bucket; must contain
                'trip_total', 'fare' and 'model_fare' columns.
            time_low (int): bucket start, in seconds.
            time_high (int): bucket end, in seconds.
            IQR (float, optional): Defaults to 1.5. Multiplier applied to the
                interquartile range of log(trip_total) to build the fence.

        Returns:
            tuple: (summary dict with 'time', 'lower', 'upper', 'avg_fare',
            'avg_model_fare'; list of index labels outside the fence).
        """
        line = {}

        trip_total = data["trip_total"]
        # Work in log space: trip totals are right-skewed, so the IQR fence
        # is computed on log(trip_total) and mapped back with exp below.
        # Compute log once and only the two quantiles actually used
        # (the original evaluated np.log twice and five quantiles).
        log_total = np.log(trip_total)
        quan = log_total.quantile([0.25, 0.75])
        iqr = stats.iqr(log_total.values)

        # NOTE(review): fmt is chosen from time_low only, so a bucket that
        # spans the 1-hour boundary formats both endpoints the same way.
        if time_low >= 3600:
            fmt = '%H hr'
        else:
            fmt = '%M min'
        line['time'] = ' ~ '.join((time.strftime(fmt, time.gmtime(time_low)),
                                   time.strftime(fmt, time.gmtime(time_high))))
        line['lower'] = np.exp(quan[0.25] - IQR * iqr)
        line['upper'] = np.exp(quan[0.75] + IQR * iqr)

        line['avg_fare'] = np.mean(data["fare"])
        line['avg_model_fare'] = np.mean(data["model_fare"])

        to_drop = data[(data["trip_total"] < line['lower']) |
                       (data["trip_total"] > line['upper'])]

        logger.info("{}: {:.5%} are outside IQR range".format(
            line['time'],
            len(to_drop) / len(data)))

        return line, list(to_drop.index)
Exemplo n.º 2
0
    def drop_at_threshhold(self, data, metric, thresh, islarger=True):
        """Remove rows whose *metric* value crosses *thresh*.

        Args:
            data (pandas.DataFrame): rows to filter.
            metric (str): column name the threshold applies to.
            thresh: cutoff value (inclusive on both sides).
            islarger (bool, optional): Defaults to True. When True, drop rows
                with metric >= thresh; otherwise drop rows with metric <= thresh.

        Returns:
            pandas.DataFrame: copy of *data* without the offending rows.
        """
        if islarger:
            logger.info("Dropping any {} more than {}".format(metric, thresh))
            mask = data[metric] >= thresh
        else:
            logger.info("Dropping any {} less than {}".format(metric, thresh))
            mask = data[metric] <= thresh

        return data.drop(data.index[mask])
Exemplo n.º 3
0
    def model_fares_outliers(self, data, rules=None):
        """Model a reference fare and mark trips that deviate too far from it.

        Args:
            data (pandas.DataFrame): trips with 'trip_miles', 'trip_seconds',
                'trip_total' and 'fare' columns; coordinate columns are
                consumed by haversine_np_df.
            rules (list, optional): Defaults to None (no rule applied).
                Sample format is
                [{"thresh": 200, "values": ("abs_fare_diff", 50)},
                 {"thresh": np.inf, "values": ("abs_pct_diff", 1)}]
                This means for trip_total ranges from 0 - 200, fare_diff > 50
                will be marked as outliers. For trip_total 200+, abs_pct_diff>1
                will be marked as outliers.

        Returns:
            pandas.DataFrame: Additional columns such as
            * modeled price fares,
            * the diffrence between modeled price and actual price
            * the absolute percentage diffrence between two prices
            * marked as outliers if satisfied the rules
        """
        # Avoid the mutable-default-argument pitfall: a [] default is shared
        # across every call to this method.
        if rules is None:
            rules = []

        # calculate the distance based on pickup and dropoff location as a
        # reference in some cases, trip_miles is not acurate.
        data["model_miles"] = haversine_np_df(data)

        # calculate a fare, based on maximum of trip_miles and model_miles.
        data["model_fare"] = 3.25 + 2.25 * data[["trip_miles", "model_miles"]]\
            .max(axis=1)+0.25*data["trip_seconds"]/36 + 3.5

        # fare difference and absolute percent change, will be used to check
        # outlier later
        data["abs_fare_diff"] = abs(data["fare"] - data["model_fare"])
        data["abs_pct_diff"] = abs(data["abs_fare_diff"] / data["model_fare"])

        data['Model_outliers'] = 0
        for i, rule in enumerate(rules):
            # A row is an outlier when it falls in this rule's trip_total
            # band AND its deviation metric exceeds the rule's limit.
            outliers = data[(data['trip_total'] <= rule["thresh"])
                            & (data[rule["values"][0]] >= rule["values"][1])]
            logger.info("{0:.5%} satisfied rules {1}".format(
                len(outliers) / len(data), i))
            data.loc[outliers.index, 'Model_outliers'] = 1

        return data
Exemplo n.º 4
0
def clean_chitax_csv(path, h5=config.get_path_taxi(), path_mapper=None):
    """ Clean the data from the original taxi CSV files

    Reads the raw Chicago taxi CSV, parses timestamps, strips dollar signs
    from money columns, remaps taxi ids, and appends the result to an HDF5
    store.

    Args:
        path (str): path of the raw CSV file to clean.
        h5 (str, optional): Defaults to config.get_path_taxi(). HDF5 file
            name under the configured data directory.
        path_mapper (str, optional): Defaults to None, in which case the
            ChiTaxiFormat MAPPER path is used. JSON file mapping taxi ids.
    """
    logger.info('Reading File {}'.format(path))
    cols = ChiTaxiFormat()

    df = pd.read_csv(path, usecols=cols.ALL)

    # Cerate datetime values
    df[cols.TIME[0]] = pd.to_datetime(df[cols.TIME[0]],
                                      format='%m/%d/%Y %I:%M:%S %p')
    df[cols.TIME[1]] = pd.to_datetime(df[cols.TIME[1]],
                                      format='%m/%d/%Y %I:%M:%S %p')
    # Clean numerical dollar values.
    # regex=False: '$' is a literal dollar sign here, not the end-of-string
    # regex anchor. np.float was removed from NumPy; builtin float is the
    # equivalent dtype.
    df[cols.NUMBERS] = df[cols.NUMBERS].apply(
        lambda col: col.str.replace('$', '', regex=False).astype(float),
        axis=0)

    if not path_mapper:
        path_mapper = cols.MAPPER
    ids = pd.read_json(path_mapper)[['taxi_id']].reset_index()

    df = pd.merge(ids, df, left_on='taxi_id', right_on='Taxi ID')
    df.drop(['Taxi ID', 'taxi_id'], axis=1, inplace=True)
    # astype returns a new frame; assign it back (the original discarded it).
    df[cols.CATS] = df[cols.CATS].astype('category')
    df.columns = cols.COLS

    logger.info('Finished data cleaning, now switched to saving HDF5')
    h5_path = os.path.join(config.get_config()['data'], h5)
    if not os.path.exists(h5_path):
        df.to_hdf(h5_path,
                  'table',
                  append=True,
                  format='table',
                  data_columns=cols.COLS,
                  min_itemsize={'company': 50})
    else:
        logger.info("HD5 Exists, we are appeding to the existing h5")
        # Specifically using append method since each dataset has uneven
        # company column strings size. Context manager ensures the store is
        # closed even if append raises (the original leaked the handle).
        with pd.HDFStore(h5_path) as store:
            store.append('table',
                         df,
                         data_columns=cols.COLS,
                         min_itemsize={'company': 50})
    logger.info('HDF converted')
Exemplo n.º 5
0
    def wrapper(*args, **kargs):
        """Log row counts before and after the wrapped cleaning function *f*.

        Expects the DataFrame either as the second positional argument
        (typically after ``self``) or as the ``data`` keyword.

        Raises:
            TypeError: when no DataFrame argument can be located (the
                original fell through to a NameError on ``data``).
        """
        if len(args) >= 2:
            data = args[1]
        elif 'data' in kargs:
            data = kargs['data']
        else:
            raise TypeError(
                "wrapped function must receive the DataFrame as the second "
                "positional argument or as the 'data' keyword")

        ori_len = len(data)
        # Typos fixed in the log messages ("Orignal", "Drroping").
        logger.info("Original - Rows: {}".format(ori_len))
        data = f(*args, **kargs)

        logger.info("After Dropping - Rows: {}".format(len(data)))
        logger.info("Drop Ratio: {0:.5%}".format(1 - len(data) / ori_len))
        return data
Exemplo n.º 6
0
    def merge_data(self, X, y, dropna=True):
        """ Both datasets should have taxi_id as their index.

        Left-joins labels *y* onto features *X* and reports how many ids
        have no label.

        Args:
            X (pandas.DataFrame): features, indexed by taxi_id.
            y (pandas.DataFrame): labels, indexed by taxi_id.
            dropna (bool, optional): Defaults to True. Drop unlabeled rows;
                when False, fill the missing labels with 0 instead.

        Returns:
            pandas.DataFrame: merged frame with the label as last column.
        """
        logger.info("Merging features and labels...")
        original = len(X)
        df = pd.merge(X, y, left_index=True, right_index=True, how='left')
        nans = df.iloc[:, -1].isna().sum()

        logger.info("We have {} unique ids in 2015".format(original))
        logger.info("{} of them ({:.5%}) dropped (nan) in 2016"
                    .format(nans, nans/original))
        if not dropna:
            # Assign the filled column back: fillna(inplace=True) on an
            # iloc slice is chained assignment and may not write through
            # in modern pandas.
            df.iloc[:, -1] = df.iloc[:, -1].fillna(0)
        else:
            df.dropna(inplace=True)
        return df
Exemplo n.º 7
0
    def drop_null_values(self, data, metric):
        """Drop every row whose *metric* value is missing (NaN/None).

        Args:
            data (pandas.DataFrame): rows to filter.
            metric (str): column checked for missing values.

        Returns:
            pandas.DataFrame: copy of *data* without the null rows.
        """
        logger.info("Dropping NA values at {}".format(metric))

        null_rows = data.index[data[metric].isnull()]
        return data.drop(null_rows)
Exemplo n.º 8
0
    def drop_zero_values(self, data, metric):
        """Drop every row where *metric* is zero or missing.

        Args:
            data (pandas.DataFrame): rows to filter.
            metric (str): column checked for zero / missing values.

        Returns:
            pandas.DataFrame: copy of *data* without the offending rows.
        """
        logger.info("Dropping 0 value {}".format(metric))

        column = data[metric]
        bad_rows = data.index[(column == 0) | column.isnull()]
        return data.drop(bad_rows)
Exemplo n.º 9
0
    def drop_duplicates(self, data, info='duplications'):
        """Remove all rows that are duplicated across the canonical columns.

        Uses keep=False, so every member of a duplicate group is dropped,
        not just the repeats.

        Args:
            data (pandas.DataFrame): rows to deduplicate.
            info (str, optional): Defaults to 'duplications'. Unused label
                kept for interface compatibility.

        Returns:
            pandas.DataFrame: deduplicated copy of *data*.
        """
        logger.info("Dropping duplications")

        schema = ChiTaxiFormat()
        return data.drop_duplicates(subset=schema.COLS, keep=False)
Exemplo n.º 10
0
    def end_to_end_clean(self, data, filename='temp.feather'):
        """Run the full cleaning pipeline and persist the result as feather.

        Applies, in order: deduplication, threshold filters on totals/miles/
        extras, NaN drops, fare-model outlier marking, time-range outlier
        marking, and a final combined-outlier drop, then saves to *filename*.

        Args:
            data (pandas.DataFrame): raw trip rows to clean.
            filename (str, optional): Defaults to 'temp.feather'. Output
                feather file name passed to loader.save_as_feather.

        Returns:
            None: the cleaned frame is written to disk, not returned.
        """
        # TODO: Dynamic responses to user inputs
        logger.info("STEP1: Duplications")
        data = self.drop_duplicates(data)

        logger.info("STEP2: Trips")
        # Trips cheaper than $0.10 are treated as bogus records.
        data = self.drop_at_threshhold(data,
                                       metric='trip_total',
                                       thresh=0.1,
                                       islarger=False)

        logger.info("STEP3: Miles")
        # 1000+ mile trips are implausible for a city taxi.
        data = self.drop_at_threshhold(data, metric='trip_miles', thresh=1000)

        logger.info("STEP4: Extras")
        data = self.drop_at_threshhold(data, metric='extras', thresh=100)

        logger.info("STEP5: NANs")
        data = self.drop_null_values(data, metric='trip_seconds')
        data = self.drop_null_values(data, metric='trip_total')

        logger.info("STEP6: Fare Models")
        # Below $200 total, an absolute fare gap > $50 flags an outlier;
        # above that, a relative gap > 100% does.
        rules = [{
            "thresh": 200,
            "values": ("abs_fare_diff", 50)
        }, {
            "thresh": np.inf,
            "values": ("abs_pct_diff", 1)
        }]
        data = self.model_fares_outliers(data, rules)

        logger.info("STEP7: Time Range Model")
        # NOTE(review): [0] keeps only the first element returned by
        # time_range_outliers (method not visible here) — presumably the
        # cleaned frame; confirm against its definition.
        data = self.time_range_outliers(data, IQR=1.5)[0]

        logger.info("STEP8: Combined Two Models")
        data = self.drop_outliers_one(data)

        loader.save_as_feather(data, filename)
        logger.info("DONE!")