def get_summary(self, data, time_low, time_high, IQR=1.5):
    """Summarise one trip-duration bucket and flag fare outliers.

    Computes a Tukey-style fence on log(trip_total): rows outside
    [Q1 - IQR*iqr, Q3 + IQR*iqr] (in log space, exponentiated back)
    are reported as outliers.

    Args:
        data: DataFrame with ``trip_total``, ``fare`` and ``model_fare``.
        time_low: bucket lower bound, in seconds.
        time_high: bucket upper bound, in seconds.
        IQR: fence multiplier for the interquartile range (default 1.5).

    Returns:
        tuple: (summary dict with keys ``time``/``lower``/``upper``/
        ``avg_fare``/``avg_model_fare``, list of outlier row indices).
    """
    summary = {}
    log_total = np.log(data["trip_total"])
    quantiles = log_total.quantile([0, 0.25, 0.5, 0.75, 1])
    spread = stats.iqr(log_total.values)
    # Label the bucket in hours once the lower bound reaches one hour,
    # otherwise in minutes; the same unit is applied to both endpoints.
    fmt = '%H hr' if time_low >= 3600 else '%M min'
    summary['time'] = ' ~ '.join((time.strftime(fmt, time.gmtime(time_low)),
                                  time.strftime(fmt, time.gmtime(time_high))))
    # Exponentiate the log-space fences back to dollar amounts.
    summary['lower'] = np.exp(quantiles[0.25] - IQR * spread)
    summary['upper'] = np.exp(quantiles[0.75] + IQR * spread)
    summary['avg_fare'] = np.mean(data["fare"])
    summary['avg_model_fare'] = np.mean(data["model_fare"])
    outside = data[(data["trip_total"] < summary['lower'])
                   | (data["trip_total"] > summary['upper'])]
    logger.info("{}: {:.5%} are outside IQR range".format(
        summary['time'], len(outside) / len(data)))
    return summary, list(outside.index)
def drop_at_threshhold(self, data, metric, thresh, islarger=True):
    """Drop rows whose ``metric`` value crosses ``thresh``.

    Args:
        data: input DataFrame.
        metric: column name to test.
        thresh: threshold value (inclusive on the dropped side).
        islarger: if True drop rows with ``metric >= thresh``,
            otherwise drop rows with ``metric <= thresh``.

    Returns:
        pandas.DataFrame: ``data`` without the offending rows.
    """
    if islarger:
        logger.info("Dropping any {} more than {}".format(metric, thresh))
        mask = data[metric] >= thresh
    else:
        logger.info("Dropping any {} less than {}".format(metric, thresh))
        mask = data[metric] <= thresh
    return data.drop(data.index[mask])
def model_fares_outliers(self, data, rules=None):
    """Model a reference fare and mark rows violating fare-diff rules.

    Fix: the original used a mutable default argument (``rules=[]``);
    replaced with a ``None`` sentinel — backward compatible.

    Args:
        data (pandas.DataFrame): trips with ``trip_miles``,
            ``trip_seconds``, ``fare`` and ``trip_total`` columns.
        rules (list, optional): Defaults to None (no rules). Sample format:
            [{"thresh": 200, "values": ("abs_fare_diff", 50)},
             {"thresh": np.inf, "values": ("abs_pct_diff", 1)}]
            meaning: for trip_total 0-200, abs_fare_diff >= 50 is an
            outlier; for trip_total above that, abs_pct_diff >= 1 is.

    Returns:
        pandas.DataFrame: ``data`` with added columns:
            * ``model_miles`` — haversine distance of the trip,
            * ``model_fare`` — modeled price,
            * ``abs_fare_diff`` — |fare - model_fare|,
            * ``abs_pct_diff`` — absolute relative difference,
            * ``Model_outliers`` — 1 where any rule matched, else 0.
    """
    if rules is None:
        rules = []
    # Distance from pickup/dropoff coordinates as a reference, since
    # trip_miles is not always accurate.
    data["model_miles"] = haversine_np_df(data)
    # Modeled fare from the larger of reported vs. computed miles:
    # base 3.25 + 2.25/mile + 0.25 per 36 seconds + 3.5 flat.
    data["model_fare"] = (3.25
                          + 2.25 * data[["trip_miles", "model_miles"]].max(axis=1)
                          + 0.25 * data["trip_seconds"] / 36
                          + 3.5)
    # Absolute and relative gap between actual and modeled price,
    # used by the outlier rules below.
    data["abs_fare_diff"] = abs(data["fare"] - data["model_fare"])
    data["abs_pct_diff"] = abs(data["abs_fare_diff"] / data["model_fare"])
    data['Model_outliers'] = 0
    for i, rule in enumerate(rules):
        outliers = data[(data['trip_total'] <= rule["thresh"])
                        & (data[rule["values"][0]] >= rule["values"][1])]
        logger.info("{0:.5%} satisfied rules {1}".format(
            len(outliers) / len(data), i))
        data.loc[outliers.index, 'Model_outliers'] = 1
    return data
def clean_chitax_csv(path, h5=config.get_path_taxi(), path_mapper=None):
    """Clean one original taxi CSV file and append it to an HDF5 store.

    Fixes vs. original:
      * ``np.float`` (removed in NumPy >= 1.24) -> builtin ``float``.
      * ``.str.replace('$', '')`` treated ``$`` as a regex end-of-string
        anchor, so dollar signs were never stripped; ``regex=False``
        makes it a literal replacement.
      * ``df[cols.CATS].astype('category')`` discarded its result (a
        no-op); the converted columns are now assigned back.

    Args:
        path: path to the source CSV file.
        h5: HDF5 file name, relative to the configured data directory.
            Defaults to ``config.get_path_taxi()``.
        path_mapper: JSON file mapping taxi ids; defaults to the
            mapper declared on ``ChiTaxiFormat``.
    """
    logger.info('Reading File {}'.format(path))
    cols = ChiTaxiFormat()
    df = pd.read_csv(path, usecols=cols.ALL)
    # Create datetime values (12-hour clock with AM/PM marker).
    df[cols.TIME[0]] = pd.to_datetime(df[cols.TIME[0]],
                                      format='%m/%d/%Y %I:%M:%S %p')
    df[cols.TIME[1]] = pd.to_datetime(df[cols.TIME[1]],
                                      format='%m/%d/%Y %I:%M:%S %p')
    # Clean numerical dollar values: strip the literal '$' and parse.
    df[cols.NUMBERS] = df[cols.NUMBERS].apply(
        lambda col: col.str.replace('$', '', regex=False).astype(float),
        axis=0)
    if not path_mapper:
        path_mapper = cols.MAPPER
    # Map raw 'Taxi ID' strings to compact integer ids.
    ids = pd.read_json(path_mapper)[['taxi_id']].reset_index()
    df = pd.merge(ids, df, left_on='taxi_id', right_on='Taxi ID')
    df.drop(['Taxi ID', 'taxi_id'], axis=1, inplace=True)
    # astype returns a new frame; assign it back so the conversion sticks.
    df[cols.CATS] = df[cols.CATS].astype('category')
    df.columns = cols.COLS
    logger.info('Finished data cleaning, now switched to saving HDF5')
    h5_path = os.path.join(config.get_config()['data'], h5)
    if not os.path.exists(h5_path):
        df.to_hdf(h5_path, 'table', append=True, format='table',
                  data_columns=cols.COLS, min_itemsize={'company': 50})
    else:
        logger.info("HD5 Exists, we are appending to the existing h5")
        store = pd.HDFStore(os.path.join(config.get_config()['data'], h5))
        # Specifically using append method since each dataset has uneven
        # company column strings size
        store.append('table', df, data_columns=cols.COLS,
                     min_itemsize={'company': 50})
    logger.info('HDF converted')
def wrapper(*args, **kargs):
    """Log row counts before and after calling the wrapped function ``f``.

    The DataFrame is taken from the second positional argument (the
    first is assumed to be ``self``) or the ``data`` keyword.

    Fixes vs. original: when neither source supplied the data, the
    original fell through to an unbound ``data``/``ori_len`` (NameError);
    now raises a clear TypeError. Also corrects typos in the log
    messages ("Orignal", "Drroping").
    """
    if len(args) >= 2:
        data = args[1]
    elif 'data' in kargs:
        data = kargs['data']
    else:
        raise TypeError(
            "wrapped function must receive the DataFrame as the second "
            "positional argument or the 'data' keyword")
    ori_len = len(data)
    logger.info("Original - Rows: {}".format(ori_len))
    data = f(*args, **kargs)
    logger.info("After Dropping - Rows: {}".format(len(data)))
    logger.info("Drop Ratio: {0:.5%}".format(1 - len(data) / ori_len))
    return data
def merge_data(self, X, y, dropna=True):
    """Left-join labels onto features; both indexed by taxi_id.

    Fixes vs. original:
      * ZeroDivisionError when ``X`` is empty — the ratio log line is
        now skipped in that case.
      * ``df.iloc[:, -1].fillna(0, inplace=True)`` mutated a slice and
        may not propagate to ``df`` (chained assignment); the filled
        column is now assigned back explicitly.

    Args:
        X: feature DataFrame, indexed by taxi_id.
        y: label DataFrame (single label column), indexed by taxi_id.
        dropna: if True drop ids missing a label; if False fill the
            label column's NaNs with 0 instead.

    Returns:
        pandas.DataFrame: merged features + label column.
    """
    logger.info("Merging features and labels...")
    original = len(X)
    df = pd.merge(X, y, left_index=True, right_index=True, how='left')
    # The label is the last (right-most) column after the merge.
    nans = df.iloc[:, -1].isna().sum()
    logger.info("We have {} unique ids in 2015".format(original))
    if original:
        logger.info("{} of them ({:.5%}) dropped (nan) in 2016"
                    .format(nans, nans / original))
    if not dropna:
        df.iloc[:, -1] = df.iloc[:, -1].fillna(0)
    else:
        df.dropna(inplace=True)
    return df
def drop_null_values(self, data, metric):
    """Drop rows where column ``metric`` is null.

    Args:
        data: input DataFrame.
        metric: column name to test for null values.

    Returns:
        pandas.DataFrame: ``data`` without the null rows.
    """
    logger.info("Dropping NA values at {}".format(metric))
    null_rows = data.index[data[metric].isna()]
    return data.drop(null_rows)
def drop_zero_values(self, data, metric):
    """Drop rows where column ``metric`` is zero or null.

    Args:
        data: input DataFrame.
        metric: column name to test.

    Returns:
        pandas.DataFrame: ``data`` without zero/null rows.
    """
    logger.info("Dropping 0 value {}".format(metric))
    col = data[metric]
    bad_rows = data.index[(col == 0) | col.isna()]
    return data.drop(bad_rows)
def drop_duplicates(self, data, info='duplications'):
    """Drop rows duplicated across all ChiTaxiFormat columns.

    ``keep=False`` removes every copy of a duplicated row, not just
    the repeats. The ``info`` parameter is unused but kept for
    interface compatibility.
    """
    schema = ChiTaxiFormat()
    logger.info("Dropping duplications")
    return data.drop_duplicates(subset=schema.COLS, keep=False)
def end_to_end_clean(self, data, filename='temp.feather'):
    """Run the full cleaning pipeline and save the result as feather.

    Steps: duplicates -> threshold drops (trip_total, trip_miles,
    extras) -> null drops -> model-fare outlier marking -> time-range
    outlier model -> combined outlier drop -> save via loader.

    Args:
        data: raw trips DataFrame (cleaned by clean_chitax_csv).
        filename: feather output file name. Defaults to 'temp.feather'.
    """
    # TODO: Dynamic responses to user inputs
    logger.info("STEP1: Duplications")
    data = self.drop_duplicates(data)
    logger.info("STEP2: Trips")
    # Drop near-zero totals (trip_total <= 0.1).
    data = self.drop_at_threshhold(data, metric='trip_total', thresh=0.1,
                                   islarger=False)
    logger.info("STEP3: Miles")
    # Drop implausibly long trips (>= 1000 miles).
    data = self.drop_at_threshhold(data, metric='trip_miles', thresh=1000)
    logger.info("STEP4: Extras")
    data = self.drop_at_threshhold(data, metric='extras', thresh=100)
    logger.info("STEP5: NANs")
    data = self.drop_null_values(data, metric='trip_seconds')
    data = self.drop_null_values(data, metric='trip_total')
    logger.info("STEP6: Fare Models")
    # Below 200 total: absolute fare gap >= $50 is an outlier;
    # above that: relative gap >= 100% is an outlier.
    rules = [{
        "thresh": 200,
        "values": ("abs_fare_diff", 50)
    }, {
        "thresh": np.inf,
        "values": ("abs_pct_diff", 1)
    }]
    data = self.model_fares_outliers(data, rules)
    logger.info("STEP7: Time Range Model")
    # NOTE(review): time_range_outliers is not visible in this chunk;
    # [0] presumably selects the cleaned frame from a tuple — confirm
    # its return shape before relying on this step.
    data = self.time_range_outliers(data, IQR=1.5)[0]
    logger.info("STEP8: Combined Two Models")
    # NOTE(review): drop_outliers_one is defined elsewhere — presumably
    # drops rows flagged by either outlier model; verify.
    data = self.drop_outliers_one(data)
    loader.save_as_feather(data, filename)
    logger.info("DONE!")