def get_events_data():
    """Get well events - takes ~30s"""
    f_start = time.time()

    # read in event data & fix column names
    events = pd.read_excel(xlsx, sheetname='INTERVENTIONS_OFM')
    events.columns = ['UNIQUEID', 'Date', 'Year', 'Month', 'Day', 'Event']

    # snap dates to month-end and drop the redundant date parts
    events['Date'] = events['Date'].dt.to_period('M').dt.to_timestamp('M')
    events.drop(['Year', 'Day', 'Month'], axis=1, inplace=True)

    # filter to shorter list of events
    keep_events = [
        'Chemical Stimulation', 'Cleaning', 'COMPLETION', 'Pump Replacement',
        'Reactivation', 'Rod Change', 'SRP Installation', 'Workover',
        'zone isolated with arena'
    ]
    events = events.loc[events['Event'].isin(keep_events)]

    # dummy the event column
    events['Event'] = events.Event.str.replace(' ', '_')
    events = pd.get_dummies(events, columns=['Event'])

    # merge multiple events per month into a single row of 0/1 flags
    events = events.groupby(['UNIQUEID', 'Date']).sum()
    events = events.applymap(lambda x: int(bool(x)))
    events.reset_index(inplace=True)

    utils.print_log('events prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return events

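# Illustrative sketch (not called by the pipeline): how get_dummies followed by
# a groupby sum collapses several events in the same well-month into one row of
# 0/1 flags, as done in get_events_data. The well IDs, dates, and events below
# are made up for demonstration only.
def _example_event_flags():
    toy = pd.DataFrame({
        'UNIQUEID': ['W1', 'W1', 'W2'],
        'Date': pd.to_datetime(['2015-01-31', '2015-01-31', '2015-01-31']),
        'Event': ['Cleaning', 'Workover', 'Cleaning'],
    })
    toy = pd.get_dummies(toy, columns=['Event'])
    # the two W1 rows for Jan-2015 collapse into a single row with both flags set
    flags = toy.groupby(['UNIQUEID', 'Date']).sum()
    flags = flags.applymap(lambda x: int(bool(x)))
    return flags.reset_index()
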
def get_general_data():
    """Get data from general tab - takes ~25 seconds"""
    f_start = time.time()

    # read in data
    gen = pd.read_excel(xlsx, sheetname='GENERAL')

    # some cleanup
    gen.drop(['WELLBORE', 'Completation_Date'], axis=1, inplace=True)
    gen['PRODUCER'] = gen['PRODUCER'].replace({'Y': 1, 'N': 0})

    # drop columns identified by slb as unnecessary
    gen.drop(
        ['SUB_STATE', 'COMPLETION_PACKING', 'COLLAPSED', 'Completation_Type'],
        axis=1, inplace=True)

    # clean & dummy categorical columns
    gen['RESERVOIR'] = gen['RESERVOIR'].replace(
        np.nan, 'RESERVOIR(DNE)').apply(
            lambda x: re.search(r'.*\((.*)\)', x).group(1)).replace(
                'DNE', np.nan)
    # gen = pd.get_dummies(
    #     gen,
    #     columns=[
    #         'RESERVOIR', 'WELLTYPE', 'SUB_STATE', 'COMPLETION_PACKING',
    #         'COLLAPSED', 'Completation_Type'
    #     ])
    gen = pd.get_dummies(gen, columns=['RESERVOIR', 'WELLTYPE'])

    utils.print_log('general prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return gen

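# Illustrative sketch (not called by the pipeline): the regex in
# get_general_data pulls the code inside the trailing parentheses of a
# RESERVOIR label. The label format 'NAME(CODE)' and the sample value are
# assumptions for demonstration only.
def _example_reservoir_code(label='ARENA 7(A7)'):
    # 'ARENA 7(A7)' -> 'A7'; missing values are first mapped to 'RESERVOIR(DNE)'
    # so this call never sees NaN, then 'DNE' is mapped back to NaN afterwards
    return re.search(r'.*\((.*)\)', label).group(1)
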
def get_production_data():
    """Get monthly production data - takes ~2min"""
    f_start = time.time()

    # read in production data
    prod_df = pd.DataFrame()
    for sheet in [
            'Monthly_PROD1_OFM', 'Monthly_PROD2_OFM', 'Monthly_PROD3_OFM',
            'Monthly_PROD4_OFM'
    ]:
        sheet_df = pd.read_excel(xlsx, sheetname=sheet)
        prod_df = pd.concat([prod_df, sheet_df], ignore_index=True)

    # drop missing values
    start = time.time()
    # 2208 rows in PROD2 and 617 rows in PROD3 are completely blank
    prod_df.dropna(how='all', inplace=True)
    # 12 rows in PROD2, 9 rows in PROD3, and 10 rows in PROD4 have an
    # assigned UNIQUEID & date but no production values
    prod_df.dropna(
        subset=['OIL [bbl]', 'GAS [Mcf]', 'WATER [bbl]'],
        how='all', inplace=True)
    utils.print_log('drop missing: ' + utils.secs_to_hms(time.time() - start))

    # resample to ensure monthly increments
    start = time.time()
    prod_df = prod_df.set_index('Date').groupby('UNIQUEID').resample(
        '1M').sum().reset_index()
    prod_df.loc[~prod_df['DAYS [days]'].isnull(), 'Well_Open'] = 1
    prod_df['Well_Open'].fillna(0, inplace=True)
    prod_df['OIL [bbl]'].fillna(0, inplace=True)
    prod_df['GAS [Mcf]'].fillna(0, inplace=True)
    prod_df['WATER [bbl]'].fillna(0, inplace=True)
    prod_df.drop('DAYS [days]', axis=1, inplace=True)
    utils.print_log('resample: ' + utils.secs_to_hms(time.time() - start))

    # add counter for months of production history
    start = time.time()
    prod_df['Well_Age'] = prod_df.groupby('UNIQUEID').cumcount() + 1
    utils.print_log('age counter: ' + utils.secs_to_hms(time.time() - start))

    utils.print_log('production prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return prod_df

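# Illustrative sketch (not called by the pipeline): how groupby().cumcount() + 1
# produces the Well_Age counter in get_production_data, numbering each well's
# monthly rows 1, 2, 3, ... independently. Toy well IDs are made up.
def _example_well_age():
    toy = pd.DataFrame({'UNIQUEID': ['W1', 'W1', 'W1', 'W2', 'W2']})
    # Well_Age comes out as 1, 2, 3 for W1 and 1, 2 for W2
    toy['Well_Age'] = toy.groupby('UNIQUEID').cumcount() + 1
    return toy
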
def get_xycdate_data():
    """Get XY coordinates, elevation, depth, and glevel - takes ~25 seconds"""
    f_start = time.time()

    # read in data
    xyc = pd.read_excel(xlsx, sheetname='XY-CDATE')

    # some cleanup
    xyc.drop(['CDATE', 'ALIAS', 'WELLBORE'], axis=1, inplace=True)

    # interpolate missing values
    for col in ['KBELEVATION [ft]', 'TOTALDEPTH [ft]', 'GLEVEL']:
        interpolate(col, xyc)

    utils.print_log('xyc prepped: ' + utils.secs_to_hms(time.time() - f_start))
    return xyc

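# The interpolate() helper used above is defined elsewhere in this module.
# The function below is only a hypothetical sketch of the kind of per-column
# fill it might perform; it is an assumption, not the actual implementation.
def _example_interpolate(col, df):
    # hypothetical: linearly interpolate interior gaps in-place, then back/
    # forward fill any remaining gaps at the edges of the column
    df[col] = df[col].interpolate().fillna(method='bfill').fillna(method='ffill')
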
def get_warter_data():
    """Get water injection data (the source sheet spells it 'Warter')"""
    f_start = time.time()

    # read in data
    warter = pd.read_excel(xlsx, sheetname='Monthly_Warter_INJ_OFM')

    # some cleanup
    warter.drop(['DAYS [days]'], axis=1, inplace=True)

    # resample to ensure monthly increments, fill nans with 0
    warter = warter.set_index('Date').groupby('UNIQUEID').resample('M').mean()
    warter['WINJ [bbl]'] = warter['WINJ [bbl]'].fillna(0)
    warter['PINJ [psig]'] = warter['PINJ [psig]'].fillna(0)
    warter.reset_index(inplace=True)

    utils.print_log('warter prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return warter

def write_summary(self, popn, treatmt, tot_cycles, elapsed_time):
    """
    Write simulation summary to file(s).

    Write summary data about this simulation run to two summary files,
    in CSV format:

        'testgroup_results.csv' (master summary file for this test group)
        'testgroup_paramset_results.csv' (summary for this param set only)

    Args
    ----
    popn : a tumour population
    treatmt : a treatment regime
    tot_cycles : total number of cycles for this simulation
    elapsed_time : simulation runtime in seconds

    Returns
    -------
    None
    """
    # Get pre-crash min and max population size, and
    # the times at which they occurred
    precrash_minmax_data = analytics.precrash_minmax(treatmt, popn)
    min_val, min_time, max_val, max_time = precrash_minmax_data

    cmin_val = cmin_time = cmax_val = cmax_time = 0
    went_through_crash = 'N'
    if analytics.went_through_crash(treatmt, popn):
        went_through_crash = 'Y'
        # Get post-crash min and max values
        postcrash_minmax_data = analytics.postcrash_minmax(treatmt, popn)
        cmin_val, cmin_time, cmax_val, cmax_time = postcrash_minmax_data

    # hasty fix for calculating max time
    # if cmax_time == 0 and recovered:
    #     cmax_time = tot_cycles

    # determine whether, when and how fully the population recovered
    recovery_status = analytics.completion_status(self, treatmt, popn)
    recovered, recover_type, recover_percent = recovery_status

    # open files (these should have been created already)
    tg_summary_fpath = "{0}/{1}_results.csv".format(
        self.test_group_dir, self.test_group)
    ps_summary_fpath = "{0}/{1}_{2}_results.csv".format(
        self.param_set_dir, self.test_group, self.param_set)
    tg_results_file = open(tg_summary_fpath, 'a')
    ps_results_file = open(ps_summary_fpath, 'a')
    tg_writer = csv.writer(tg_results_file)
    ps_writer = csv.writer(ps_results_file)

    # determine proportion of tumour taken up by dominant clone
    if not popn.is_dead():
        dom_clone_size = analytics.get_dom_clone_size(popn.subpop)
        dom_clone_proportion = dom_clone_size / float(popn.tumoursize)
    else:
        dom_clone_proportion = 0

    # determine average depth of clones in tumour
    avg_depth = analytics.get_avg_depth(popn)

    # calculate total number of mutations
    total_mutns = 0
    for mut_type in popn.all_mutations:
        total_mutns += len(popn.all_mutations[mut_type])
    generated_resist_mutns = len(popn.all_mutations['r'])
    surviving_resist_mutns = 0
    for mut in popn.all_mutations['r']:
        if not mut.original_clone.is_dead_end():
            surviving_resist_mutns += 1

    # assemble values to write
    summary_vals = (self.param_set, self.run_number, went_through_crash,
                    recovered, recover_type, recover_percent,
                    popn.opt.pro, popn.opt.die, popn.opt.mut,
                    treatmt.crash_time, treatmt.init_select_pressure,
                    max(popn.analytics_base.select_pressure),
                    popn.opt.prob_mut_pos, popn.opt.prob_mut_neg,
                    popn.opt.prob_inc_mut, popn.opt.prob_dec_mut,
                    popn.analytics_base.tumoursize[-1],
                    popn.analytics_base.clonecount[-1],
                    '{:.5f}'.format(dom_clone_proportion),
                    '{:.5f}'.format(avg_depth),
                    popn.analytics_base.avg_mutation[-1],
                    popn.analytics_base.avg_proliferation[-1],
                    secs_to_hms(elapsed_time), tot_cycles,
                    total_mutns, generated_resist_mutns,
                    surviving_resist_mutns,
                    min_val, min_time, max_val, max_time,
                    cmin_val, cmin_time, cmax_val, cmax_time)

    tg_writer.writerow(summary_vals)
    ps_writer.writerow(summary_vals)

    tg_results_file.close()
    ps_results_file.close()

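# A possible tidier variant of the file handling in write_summary (a sketch,
# not part of the simulation code): opening both summary files with context
# managers guarantees they are closed even if a write raises. The function
# name and its arguments are hypothetical.
def _example_append_summary_row(tg_path, ps_path, row):
    # append the same summary row to the test-group and param-set CSVs
    with open(tg_path, 'a') as tg_f, open(ps_path, 'a') as ps_f:
        csv.writer(tg_f).writerow(row)
        csv.writer(ps_f).writerow(row)
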
def get_intervals():
    """Get intervals data"""
    f_start = time.time()

    # read in data
    intervals_df = pd.DataFrame()
    for sheet in ['INTERVALS_1', 'INTERVALS_2']:
        sheet_df = pd.read_excel(xlsx, sheetname=sheet)
        intervals_df = pd.concat([intervals_df, sheet_df], ignore_index=True)
    intervals_df['DK_Diff'] = (
        intervals_df['TOP [ft]'] - intervals_df['BASE [ft]'])
    intervals_df.rename(columns={'STATUS': 'DK_STATUS'}, inplace=True)
    intervals_df = pd.get_dummies(intervals_df, columns=['DK_STATUS'])

    # get aggregate level data
    # start off with number of intervals per well
    aggregates = intervals_df.groupby('UNIQUEID')['DK'].max().to_frame()
    aggregates.rename(columns={'DK': 'Total_Num_DK'}, inplace=True)
    for metric in ['TOP [ft]', 'BASE [ft]', 'DK_Diff']:
        # calc average depth across all DK
        new_col = 'Avg_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].mean()
        # calc min depth across all DK
        new_col = 'Min_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].min()
        # calc max depth across all DK
        new_col = 'Max_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].max()
    aggregates.reset_index(inplace=True)

    # get time series data
    status_cols = [c for c in intervals_df if c.startswith('DK_STATUS_')]
    int_ts = intervals_df.groupby(['UNIQUEID', 'Date'])[status_cols].sum()
    open_dk = intervals_df.loc[intervals_df['DK_STATUS_OPEN'] == 1]
    for metric in ['TOP [ft]', 'BASE [ft]', 'DK_Diff']:
        # calc average depth across all open DK
        new_col = 'Avg_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].mean()
        # calc min depth across all open DK
        new_col = 'Min_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].min()
        # calc max depth across all open DK
        new_col = 'Max_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].max()

    # resample time series data from daily to monthly
    int_ts.reset_index(inplace=True)
    int_ts['Date'] = int_ts['Date'].dt.to_period('M').dt.to_timestamp('M')
    int_ts = int_ts.groupby(['UNIQUEID', 'Date']).last()
    int_ts.fillna(0, inplace=True)
    int_ts.reset_index(inplace=True)

    # combine aggregate with time series data
    all_interval = int_ts.merge(aggregates, how='left', on='UNIQUEID')

    utils.print_log('intervals prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return all_interval

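# Illustrative sketch (not called by the pipeline): how the final left join in
# get_intervals broadcasts the static per-well aggregates onto every monthly
# row for that well. Toy well IDs, dates, and values are made up.
def _example_aggregate_merge():
    monthly = pd.DataFrame({
        'UNIQUEID': ['W1', 'W1', 'W2'],
        'Date': pd.to_datetime(['2015-01-31', '2015-02-28', '2015-01-31']),
        'DK_STATUS_OPEN': [1, 0, 2],
    })
    per_well = pd.DataFrame({'UNIQUEID': ['W1', 'W2'], 'Total_Num_DK': [3, 5]})
    # every monthly row picks up its well's aggregate columns
    return monthly.merge(per_well, how='left', on='UNIQUEID')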