def randomize_speed(df, contain_zero):
    """Spread each recorded speed over a unit-wide interval with uniform noise.

    Works on a copy of ``df``, so the caller's frame is not modified.  As a
    side effect it draws overlaid histograms of the original and the
    redistributed speeds and prints which redistribution was applied.

    Args:
        df: DataFrame with a numeric ``speed`` column.
        contain_zero: If truthy, every speed ``x`` is replaced by a uniform
            draw from ``[x, x + 1)``.  Otherwise every positive speed is
            replaced by a draw from ``[x - 1, x)`` and non-positive speeds
            are kept unchanged.

    Returns:
        Tuple ``(new_df, speed_redistribution_info)``: the copy with its
        ``speed`` column replaced, and the description string that was
        printed.
    """
    df = df.copy()
    # Round down speed, need more caution
    if contain_zero:
        speed_redistribution_info = 'Redistribute upward, e.g. 0 -> [0,1]'
        speed_ran = df['speed'].apply(
            lambda x: x + np.random.uniform(0, 1))
    else:
        speed_redistribution_info = 'Redistribute downward, e.g. 1 -> [0,1]'
        speed_ran = df['speed'].apply(
            lambda x: x + np.random.uniform(-1, 0) if x > 0 else x)

    max_speed = df['speed'].max()
    df['speed'].hist(bins=np.arange(0, max_speed), alpha=0.5,
                     label='Original Data')
    speed_ran.hist(bins=np.arange(0, max_speed, 0.5), alpha=0.5,
                   label='Redistributed Data')
    # Function-call form so the module parses on Python 3 as well as Python 2.
    print(speed_redistribution_info)
    plt_configure(xlabel="Speed", ylabel="Frequency", legend=True,
                  figsize=(8, 3))

    # The randomized values are kept in a local Series instead of a temporary
    # column, so no positional-axis ``drop`` call is needed afterwards.
    df['speed'] = speed_ran
    return df, speed_redistribution_info
def is_with_too_many_zero(df, threshold=1.5):
    """Check whether the lowest speed bin is over-represented.

    The ``speed`` column is histogrammed with unit-wide bins whose edges are
    ``np.arange(0, df['speed'].max())``.  The first bin is flagged when its
    count is at least ``threshold`` times the count of the second bin.  When
    flagged, the speed histogram is plotted and a message is printed.

    Args:
        df: DataFrame with a numeric ``speed`` column.
        threshold: Minimum ratio ``count[0] / count[1]`` that is treated as
            "too many zeros".

    Returns:
        Tuple ``(too_many_zero, null_wind_frequency)`` where
        ``null_wind_frequency`` is ``count[0] / len(df)``.

    Note:
        At least three bin edges are needed for the second bin to exist;
        with a smaller speed range the ``count[1]`` lookup fails.
    """
    bins = np.arange(0, df['speed'].max())
    count, _ = np.histogram(df['speed'], bins=bins)
    zero_count, next_count = count[0], count[1]

    # Explicit float division: identical on Python 3, and avoids the silent
    # floor-to-zero of integer division on Python 2.
    null_wind_frequency = zero_count / float(len(df))

    # Handle an empty second bin explicitly instead of relying on numpy's
    # divide-by-zero warning: x/0 -> inf (always flagged), 0/0 -> nan (never).
    if next_count > 0:
        zero_ratio = zero_count / float(next_count)
    elif zero_count > 0:
        zero_ratio = float('inf')
    else:
        zero_ratio = float('nan')

    too_many_zero = bool(zero_ratio >= threshold)
    if too_many_zero:
        df['speed'].plot(kind='hist', bins=bins, alpha=0.5)
        plt_configure(figsize=(4, 3), title='Original speed distribution')
        print(' Too many zeros')
    return too_many_zero, null_wind_frequency
def is_with_too_many_zero(df, threshold=1.5):
    """Tell whether the first speed bin dominates the second one.

    Bin edges are ``np.arange(0, df['speed'].max())`` (unit width).  The
    frame is flagged when ``count[0] / count[1] >= threshold``; in that case
    the speed histogram is plotted and a message is printed.

    Args:
        df: DataFrame with a numeric ``speed`` column.
        threshold: Ratio of first-bin count to second-bin count at or above
            which the data is considered to have too many zeros.

    Returns:
        Tuple ``(too_many_zero, null_wind_frequency)``; the second item is
        the first-bin count divided by the number of rows.

    Note:
        The speed range must produce at least two bins, otherwise the
        ``count[1]`` lookup fails.
    """
    bins = np.arange(0, df['speed'].max())
    count, _ = np.histogram(df['speed'], bins=bins)
    first_bin, second_bin = count[0], count[1]

    # float() keeps this a true division on Python 2 as well as Python 3.
    null_wind_frequency = first_bin / float(len(df))

    # An empty second bin is resolved explicitly (inf or nan) rather than
    # through a numpy divide-by-zero warning.
    if second_bin > 0:
        ratio = first_bin / float(second_bin)
    elif first_bin > 0:
        ratio = float('inf')
    else:
        ratio = float('nan')

    too_many_zero = bool(ratio >= threshold)
    if too_many_zero:
        df['speed'].plot(kind='hist', bins=bins, alpha=0.5)
        plt_configure(figsize=(4, 3), title='Original speed distribution')
        print(' Too many zeros')
    return too_many_zero, null_wind_frequency
def plot_sectoral_comparison(gmm, weibull, direction, datasize):
    """Plot per-direction GMM and Weibull values with their average lines.

    Each series is drawn against ``direction`` as a marked solid line, and a
    dashed horizontal line in the same colour marks the second value
    returned by ``nominal_avg_and_weight_avg(datasize, series)`` (labelled
    as the weighted average in the legend).

    Args:
        gmm: Per-direction values for the GMM fit.
        weibull: Per-direction values for the Weibull fit.
        direction: X coordinates shared by both series.
        datasize: Passed as the first argument to
            ``nominal_avg_and_weight_avg`` for both series.
    """
    from plot_print_helper import plt_configure

    labelled_series = (('GMM', gmm), ('Weibull', weibull))

    # Both averages are computed before anything is drawn.
    averages = []
    for _, values in labelled_series:
        _, weighted_mean = nominal_avg_and_weight_avg(datasize, values)
        averages.append(weighted_mean)

    for (label, values), weighted_mean in zip(labelled_series, averages):
        (curve,) = plt.plot(direction, values, '-', label=label, marker='o')
        # Reuse the curve's colour so the dashed line pairs with its series.
        plt.axhline(weighted_mean, linestyle='--', color=curve.get_color(),
                    label=label + ' weighted average')

    plt_configure(xlabel='Direction', legend={'loc': 'best'}, figsize=(5, 3))
    plt.locator_params(axis='y', nbins=5)
def plot_sectoral_comparison(gmm, weibull, direction, datasize):
    """Draw GMM and Weibull sector values plus a dashed average line each.

    For each series the second value returned by
    ``nominal_avg_and_weight_avg(datasize, series)`` is shown as a dashed
    horizontal line in the colour of that series' curve (legend label
    "... weighted average").

    Args:
        gmm: Per-direction values for the GMM fit.
        weibull: Per-direction values for the Weibull fit.
        direction: X coordinates shared by both series.
        datasize: First argument handed to ``nominal_avg_and_weight_avg``.
    """
    from plot_print_helper import plt_configure

    labelled_series = (('GMM', gmm), ('Weibull', weibull))

    # Averages first, drawing second.
    averages = []
    for _, values in labelled_series:
        _, weighted_mean = nominal_avg_and_weight_avg(datasize, values)
        averages.append(weighted_mean)

    for (label, values), weighted_mean in zip(labelled_series, averages):
        (curve,) = plt.plot(direction, values, '-', label=label, marker='o')
        plt.axhline(weighted_mean, linestyle='--', color=curve.get_color(),
                    label=label + ' weighted average')

    plt_configure(xlabel='Direction', legend={'loc': 'best'},
                  figsize=(4.5, 2.5))
    plt.locator_params(axis='y', nbins=5)
def fill_direction_999(df, SECTOR_LENGTH):
    """Replace the 999 sentinel in ``dir`` by interpolated, sector-aligned values.

    Rows whose ``dir`` equals 999 are set to NaN and filled with
    ``Series.interpolate()``; the result is then floored to a multiple of
    ``SECTOR_LENGTH``.  Two figures are produced: a bar chart of
    ``wind_type`` counts, and overlaid ``dir`` histograms before and after
    the replacement.

    NOTE: ``df`` is modified in place (no copy is taken) and also returned.
    NOTE(review): the interpolation does not account for the 0/360
    wrap-around of angles -- confirm this is acceptable.

    Args:
        df: DataFrame with ``wind_type`` and numeric ``dir`` columns.
        SECTOR_LENGTH: Sector width the filled directions are floored to.

    Returns:
        The same DataFrame object with its ``dir`` column rewritten.
    """
    plt.figure()
    df['wind_type'].value_counts().plot(
        kind='bar', title='Wind Types Comprisement', figsize=(4, 3))

    plt.figure()
    bins = np.arange(0, df['dir'].max() + 100, 10)
    df['dir'].plot(kind='hist', alpha=0.5, bins=bins,
                   label='before interpolation')

    # Vectorised masking of the sentinel: keeps every value that is not 999
    # and puts NaN elsewhere, without a Python-level call per row.
    df['dir'] = df['dir'].where(df['dir'] != 999)
    df['dir'] = df['dir'].interpolate() // SECTOR_LENGTH * SECTOR_LENGTH

    df['dir'].plot(kind='hist', alpha=0.5, bins=bins,
                   label='after interpolation')
    plt_configure(title='Dir 999 record handling comparison', figsize=(8, 3),
                  legend={'loc': 'best'})
    return df
def randomize_angle(df, DIR_REDISTRIBUTE, sector_span=10):
    """Spread each direction value over its sector with uniform noise.

    Works on a copy of ``df``.  Also draws overlaid histograms of the
    original and the redistributed directions.

    Args:
        df: DataFrame with a numeric ``dir`` column.
        DIR_REDISTRIBUTE: ``'even'`` adds noise drawn from
            ``[-sector_span / 2, sector_span / 2)``; any other value adds
            noise drawn from ``[0, sector_span)``.
        sector_span: Width of one direction sector.

    Returns:
        The copy of ``df`` with its ``dir`` column replaced.  The new values
        are not wrapped, so they can fall below 0 or reach past the largest
        original direction.
    """
    df = df.copy()
    if DIR_REDISTRIBUTE == 'even':
        # 2.0 keeps the half-span exact for odd spans under Python 2 too.
        half_span = sector_span / 2.0
        low, high = -half_span, half_span
    else:
        low, high = 0, sector_span
    dir_ran = df['dir'].apply(lambda x: x + np.random.uniform(low, high))

    df['dir'].hist(bins=np.arange(0, 360 + 10, 5), alpha=0.5,
                   label='Original Data')
    dir_ran.hist(bins=np.arange(0, 360 + 10, 1), alpha=0.5,
                 label='Redistributed Data')
    plt_configure(xlabel="Direction", ylabel="Frequency", tight='x',
                  legend={'loc': 'best'}, figsize=(8, 3))

    # Held in a local Series rather than a temporary column, so there is
    # nothing to drop (the positional-axis ``drop`` fails on pandas >= 2.0).
    df['dir'] = dir_ran
    return df
def knot_unit_detect(df):
    """Guess whether ``speed`` was recorded in knots and, if so, convert it.

    The fractional part of every speed is stored in a ``decimal`` column.
    When more than 30% of the rows have a fractional part of at least 0.2,
    the speeds are multiplied by 1.943845 (presumably the m/s-to-knot
    factor, per the histogram labels) and rounded to the nearest integer.
    The fractional-part histograms before and, if converted, after are
    plotted.

    NOTE: ``df`` is modified in place -- the ``decimal`` column is added and
    ``speed`` may be rewritten -- and the same object is returned.

    Args:
        df: DataFrame with a numeric ``speed`` column.

    Returns:
        Tuple ``(knot_unit, df)`` where ``knot_unit`` is True when the
        conversion was applied.
    """
    # 1. Determine whether using knot unit
    df['decimal'] = df['speed'] % 1
    df['decimal'].hist(alpha=0.5, label='m/s', figsize=(4, 3))
    # Mean of a boolean mask is the share of True rows: always a float
    # (no Python 2 integer division) and NaN, hence False, for an empty frame.
    knot_unit = bool((df['decimal'] >= 0.2).mean() > 0.3)

    # 2. Convert into knot unit
    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df['speed'] % 1
        df['decimal'].hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))

    plt_configure(xlabel='Decimal', ylabel='Frequency',
                  legend={'loc': 'best'}, title='Decimal Distribution')
    return knot_unit, df
def randomize_speed(df, contain_zero):
    """Add uniform noise to ``speed`` so values fill a unit-wide interval.

    A copy of ``df`` is modified and returned; the input frame is left
    as is.  Overlaid histograms of the original and redistributed speeds
    are drawn and the chosen redistribution is printed.

    Args:
        df: DataFrame with a numeric ``speed`` column.
        contain_zero: Truthy to shift upward (``x`` becomes a draw from
            ``[x, x + 1)``); falsy to shift downward (positive ``x`` becomes
            a draw from ``[x - 1, x)``, non-positive values are kept).

    Returns:
        Tuple ``(new_df, speed_redistribution_info)``.
    """
    def _shift_up(x):
        return x + np.random.uniform(0, 1)

    def _shift_down(x):
        # Non-positive speeds are left alone so they cannot go negative.
        return x + np.random.uniform(-1, 0) if x > 0 else x

    df = df.copy()
    # Round down speed, need more caution
    if contain_zero:
        speed_redistribution_info = 'Redistribute upward, e.g. 0 -> [0,1]'
        speed_ran = df['speed'].apply(_shift_up)
    else:
        speed_redistribution_info = 'Redistribute downward, e.g. 1 -> [0,1]'
        speed_ran = df['speed'].apply(_shift_down)

    max_speed = df['speed'].max()
    df['speed'].hist(bins=np.arange(0, max_speed), alpha=0.5,
                     label='Original Data')
    speed_ran.hist(bins=np.arange(0, max_speed, 0.5), alpha=0.5,
                   label='Redistributed Data')
    # print() call form: valid on both Python 2 and Python 3.
    print(speed_redistribution_info)
    plt_configure(xlabel="Speed", ylabel="Frequency", legend=True,
                  figsize=(8, 3))

    # No temporary column is created, so no ``drop`` with a positional axis
    # (rejected by pandas >= 2.0) is required.
    df['speed'] = speed_ran
    return df, speed_redistribution_info
def knot_unit_detect(df):
    """Detect a knot-based ``speed`` column and convert it when found.

    The fractional part of each speed goes into a ``decimal`` column.  If
    more than 30% of rows have a fractional part >= 0.2, speeds are
    multiplied by 1.943845 (presumably m/s to knots, per the histogram
    labels) and rounded to integers.  Histograms of the fractional part
    are drawn before and, when converted, after the scaling.

    NOTE: operates on the caller's frame: ``decimal`` is added and ``speed``
    may be overwritten; the same object is returned.

    Args:
        df: DataFrame with a numeric ``speed`` column.

    Returns:
        Tuple ``(knot_unit, df)``; ``knot_unit`` is True when the speeds
        were converted.
    """
    # 1. Determine whether using knot unit
    df['decimal'] = df['speed'] % 1
    df['decimal'].hist(alpha=0.5, label='m/s', figsize=(4, 3))
    # Share of rows with a sizeable fractional part.  Using the mask mean
    # avoids integer division on Python 2 and a ZeroDivisionError on an
    # empty frame (the mean is NaN there, which compares as False).
    large_decimal_share = (df['decimal'] >= 0.2).mean()
    knot_unit = bool(large_decimal_share > 0.3)

    # 2. Convert into knot unit
    if knot_unit:
        df['speed'] = df['speed'] * 1.943845
        df['decimal'] = df['speed'] % 1
        df['decimal'].hist(alpha=0.5, label='knot')
        # need more elaboration, some is not near an integer
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))

    plt_configure(xlabel='Decimal', ylabel='Frequency',
                  legend={'loc': 'best'}, title='Decimal Distribution')
    return knot_unit, df
def randomize_speed(df, redistribute_method='round_up'):
    """Spread each recorded speed over a unit-wide interval with uniform noise.

    Works on a copy of ``df``.  Overlaid histograms of the original and the
    redistributed speeds are drawn and the chosen redistribution is printed.

    Args:
        df: DataFrame with a numeric ``speed`` column.
        redistribute_method: One of

            * ``'round_up'``   -- ``x`` becomes a draw from ``[x, x + 1)``;
            * ``'round_down'`` -- positive ``x`` becomes a draw from
              ``[x - 1, x)``, non-positive values are kept;
            * ``'even'``       -- positive ``x`` becomes a draw from
              ``[x - 0.5, x + 0.5)``, non-positive ``x`` a draw from
              ``[x, x + 0.5)``.

    Returns:
        Tuple ``(new_df, speed_redistribution_info)``: the copy with its
        ``speed`` column replaced, and the description string that was
        printed.

    Raises:
        ValueError: If ``redistribute_method`` is not one of the names above.
    """
    def _round_up(x):
        return x + np.random.uniform(0, 1)

    def _round_down(x):
        return x + np.random.uniform(-1, 0) if x > 0 else x

    def _even(x):
        if x > 0:
            return x + np.random.uniform(-0.5, 0.5)
        # Only shift upward here so a zero speed cannot become negative.
        return x + np.random.uniform(0, 0.5)

    strategies = {
        'round_up': ('Redistribute upward, e.g. 0 -> [0,1]', _round_up),
        'round_down': ('Redistribute downward, e.g. 1 -> [0,1]', _round_down),
        'even': ('Redistribute evenly, e.g. 0 -> [0, 0.5]; 1 -> [0.5,1.5]',
                 _even),
    }
    # Fail early with a clear message instead of a later KeyError on a
    # column that was never created.
    if redistribute_method not in strategies:
        raise ValueError(
            'Unknown redistribute_method %r; expected one of %s'
            % (redistribute_method, sorted(strategies)))
    speed_redistribution_info, shift = strategies[redistribute_method]

    df = df.copy()
    # Round down speed, need more caution
    speed_ran = df['speed'].apply(shift)

    max_speed = df['speed'].max()
    df['speed'].hist(bins=np.arange(0, max_speed), alpha=0.5,
                     label='Original Data')
    speed_ran.hist(bins=np.arange(0, max_speed, 0.5), alpha=0.5,
                   label='Redistributed Data')
    print(speed_redistribution_info)
    plt_configure(xlabel="Speed", ylabel="Frequency", legend=True,
                  figsize=(8, 3))

    # Assigned straight from the local Series; no temporary column and thus
    # no ``drop`` with a positional axis (rejected by pandas >= 2.0).
    df['speed'] = speed_ran
    return df, speed_redistribution_info