def cov_estimation(list_of_recarrays, index_name, pair_wise=False):

    def get_the_other_name(rec, index_name):
        assert len(rec.dtype.names) == 2
        name = [nm for nm in rec.dtype.names if nm != index_name]
        assert len(name) == 1
        return name[0]

    for array in list_of_recarrays:
        array[get_the_other_name(array, index_name)] = winsorize(
            array[get_the_other_name(array, index_name)], 99)
    nn = len(list_of_recarrays)
    if not pair_wise:
        new_rec = list_of_recarrays[0]
        for ii in range(1, nn):
            new_rec = rec_join(index_name, new_rec, list_of_recarrays[ii],
                               jointype='inner', defaults=None,
                               r1postfix='', r2postfix=str(ii + 1))
        dat_mat = np.c_[[new_rec[nm] for nm in new_rec.dtype.names
                         if nm != index_name]]
        covmat = np.cov(dat_mat)
    else:
        covmat = np.zeros((nn, nn))
        for ii in range(0, nn):
            covmat[ii, ii] = list_of_recarrays[ii][
                get_the_other_name(list_of_recarrays[ii], index_name)].var()
            for jj in range(ii + 1, nn):
                new_rec = rec_join(index_name, list_of_recarrays[ii],
                                   list_of_recarrays[jj], jointype='inner',
                                   defaults=None, r1postfix='1', r2postfix='2')
                dat_mat = np.c_[[new_rec[nm] for nm in new_rec.dtype.names
                                 if nm != index_name]]
                tmp_cov = np.cov(dat_mat)[0, 1]
                covmat[ii, jj] = tmp_cov
                covmat[jj, ii] = tmp_cov
    return covmat
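# Usage sketch (not part of the original source): exercises cov_estimation on
# two synthetic return series keyed by 'date'.  It assumes an older Matplotlib
# (pre-3.1), where mlab.rec_join still exists, and supplies stand-ins for the
# rec_join and winsorize helpers that the function expects to find in scope.
import numpy as np
from matplotlib import mlab

rec_join = mlab.rec_join

def winsorize(arr, pct):
    # Simple stand-in: clip values above the given percentile.
    return np.minimum(arr, np.percentile(arr, pct))

dates = np.arange('2020-01-01', '2020-04-01', dtype='datetime64[D]').astype(object)
ra = np.rec.fromarrays([dates, np.random.randn(len(dates))], names=['date', 'ret_a'])
rb = np.rec.fromarrays([dates, np.random.randn(len(dates))], names=['date', 'ret_b'])

print(cov_estimation([ra, rb], 'date'))                  # joint estimate
print(cov_estimation([ra, rb], 'date', pair_wise=True))  # pairwise estimate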
def create_species_plot_count_file(self):
    p = self.parameter_parser

    # Store list of plot IDs into a string if this variable hasn't
    # yet been created
    if not hasattr(self, 'id_str'):
        self.id_str = self._get_id_string()

    spp_plot_table = self.plot_db.get_species_plot_counts(self.id_str)
    spp_plot_file = p.model_directory + '/' + p.model_type + \
        '_spp_plot_counts.csv'
    if p.model_type in p.imagery_model_types:
        utilities.rec2csv(spp_plot_table, spp_plot_file)
    else:
        # Create 2 ID strings for non-imagery models, one with inventory
        # and Ecoplots and one with inventory plots only
        try:
            ecoplot_index = p.plot_types.index('ecoplot')
        except ValueError:
            # If Ecoplots are not already in the list, create another ID
            # string with them included
            plot_types_w_eco = p.plot_types
            plot_types_w_eco.append('ecoplot')
            plot_types_w_eco_str = ','.join(plot_types_w_eco)
            id_str2 = self._get_id_string(plot_types_w_eco_str)
            id_eco = 2
        else:
            # If Ecoplots are already in the list, create another ID
            # string without them included
            plot_types_wo_eco = p.plot_types
            plot_types_wo_eco.remove('ecoplot')
            plot_types_wo_eco_str = ','.join(plot_types_wo_eco)
            id_str2 = self._get_id_string(plot_types_wo_eco_str)
            id_eco = 1

        spp_plot_table2 = self.plot_db.get_species_plot_counts(id_str2)

        # Join the plot counts w/ Ecoplots to the plot counts w/o Ecoplots
        if id_eco == 1:
            joined_spp_plot_table = mlab.rec_join(
                'SPP_LAYER', spp_plot_table, spp_plot_table2, 'leftouter')
        else:
            joined_spp_plot_table = mlab.rec_join(
                'SPP_LAYER', spp_plot_table2, spp_plot_table, 'leftouter')
        utilities.rec2csv(joined_spp_plot_table, spp_plot_file)
def recs_inner_join(key, recs, postfixes):
    # NOTE: the `postfixes` argument is currently unused; colliding field
    # names always receive the fixed '1'/'2' suffixes on each pairwise join.
    new_rec = recs[0]
    for ii in range(1, len(recs)):
        r1postfix = '1'
        r2postfix = '2'
        new_rec = mlab.rec_join(key, new_rec, recs[ii], jointype='inner',
                                defaults=None, r1postfix=r1postfix,
                                r2postfix=r2postfix)
    return new_rec
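# Usage sketch (not part of the original source): joins three small recarrays
# on a shared 'id' key with the helper above.  Assumes an older Matplotlib
# (pre-3.1) where mlab.rec_join is still available; as noted, colliding field
# names get the fixed '1'/'2' suffixes regardless of the `postfixes` argument.
import numpy as np
import matplotlib.mlab as mlab

r1 = np.rec.fromrecords([(1, 10.0), (2, 20.0), (3, 30.0)], names=['id', 'value'])
r2 = np.rec.fromrecords([(1, 1.5), (2, 2.5)], names=['id', 'value'])
r3 = np.rec.fromrecords([(1, 'a'), (2, 'b')], names=['id', 'label'])

joined = recs_inner_join('id', [r1, r2, r3], postfixes=['x', 'y', 'z'])
print(joined.dtype.names)   # expected: ('id', 'value1', 'value2', 'label')
print(mlab.rec2txt(joined))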
def get_predicted_estimates(self):
    # Read in the predicted raster
    ds = gdal.Open(self.predicted_raster, gdalconst.GA_ReadOnly)
    rat = ds.GetRasterBand(1).GetDefaultRAT()

    # Get the cell area for converting from pixel counts to hectares
    gt = ds.GetGeoTransform()
    cell_area = gt[1] * gt[1]

    # Get the IDs and counts (converted to hectares)
    id_recs = []
    nf_hectares = 0
    for i in range(rat.GetRowCount()):
        id = rat.GetValueAsInt(i, 0)
        hectares = rat.GetValueAsInt(i, 1) * cell_area / 10000.0
        if id <= 0:
            nf_hectares += hectares
        else:
            id_recs.append((id, hectares))

    # Release the dataset
    ds = None

    # Convert this to a recarray
    names = (self.id_field, 'HECTARES')
    ids = np.rec.fromrecords(id_recs, names=names)

    # Read in the attribute file
    sad = utilities.csv2rec(self.stand_attribute_file)

    # Ensure that all IDs in the id_count_dict are in the attribute data
    ids_1 = getattr(ids, self.id_field)
    ids_2 = getattr(sad, self.id_field)
    if not np.all(np.in1d(ids_1, ids_2)):
        err_msg = 'Not all values in the raster are present in the '
        err_msg += 'attribute data'
        raise ValueError(err_msg)

    # Join the two recarrays together
    predicted_data = mlab.rec_join(self.id_field, ids, sad)
    return (predicted_data, nf_hectares)
import numpy as np
import matplotlib.mlab as mlab

r = mlab.csv2rec('../data/aapl.csv')
r.sort()
r1 = r[-10:]

# Create a new array
r2 = np.empty(12, dtype=[('date', '|O4'), ('high', np.float),
                         ('marker', np.float)])
r2 = r2.view(np.recarray)
r2.date = r.date[-17:-5]
r2.high = r.high[-17:-5]
r2.marker = np.arange(12)

print "r1:"
print mlab.rec2txt(r1)
print "r2:"
print mlab.rec2txt(r2)

defaults = {'marker': -1, 'close': np.NaN, 'low': -4444.}

for s in ('inner', 'outer', 'leftouter'):
    rec = mlab.rec_join(['date', 'high'], r1, r2,
                        jointype=s, defaults=defaults)
    print "\n%sjoin :\n%s" % (s, mlab.rec2txt(rec))
# grab the price data off yahoo
u1 = urllib.urlretrieve('http://ichart.finance.yahoo.com/table.csv?s=AAPL&d=9&e=14&f=2008&g=d&a=8&b=7&c=1984&ignore=.csv')
u2 = urllib.urlretrieve('http://ichart.finance.yahoo.com/table.csv?s=GOOG&d=9&e=14&f=2008&g=d&a=8&b=7&c=1984&ignore=.csv')

# load the CSV files into record arrays
r1 = mlab.csv2rec(file(u1[0]))
r2 = mlab.csv2rec(file(u2[0]))

# compute the daily returns and add these columns to the arrays
gains1 = np.zeros_like(r1.adj_close)
gains2 = np.zeros_like(r2.adj_close)
gains1[1:] = np.diff(r1.adj_close) / r1.adj_close[:-1]
gains2[1:] = np.diff(r2.adj_close) / r2.adj_close[:-1]
r1 = mlab.rec_append_fields(r1, 'gains', gains1)
r2 = mlab.rec_append_fields(r2, 'gains', gains2)

# now join them by date; the default postfixes are 1 and 2
r = mlab.rec_join('date', r1, r2)

# long appl, short goog
g = r.gains1 - r.gains2
tr = (1 + g).cumprod()  # the total return

# plot the return
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(r.date, tr)
ax.set_title('total return: long appl, short goog')
ax.grid()
fig.autofmt_xdate()
plt.show()
r1 = mlab.csv2rec(open(u1[0]))
r2 = mlab.csv2rec(open(u2[0]))

# compute the daily returns and add these columns to the arrays
gains1 = np.zeros_like(r1.adj_close)
gains2 = np.zeros_like(r2.adj_close)
gains1[1:] = np.diff(r1.adj_close) / r1.adj_close[:-1]
gains2[1:] = np.diff(r2.adj_close) / r2.adj_close[:-1]
r1 = mlab.rec_append_fields(r1, 'gains', gains1)
r2 = mlab.rec_append_fields(r2, 'gains', gains2)

# now join them by date; the default postfixes are 1 and 2.  The
# default jointype is inner so it will do an intersection of dates and
# drop the dates in AAPL which occurred before GOOG started trading in
# 2004.  r1 and r2 are reverse ordered by date since Yahoo returns
# most recent first in the CSV files, but rec_join will sort by key so
# r below will be properly sorted
r = mlab.rec_join('date', r1, r2)

# long appl, short goog
g = r.gains1 - r.gains2
tr = (1 + g).cumprod()  # the total return

# plot the return
fig, ax = plt.subplots()
ax.plot(r.date, tr)
ax.set_title('total return: long APPL, short GOOG')
ax.grid()
fig.autofmt_xdate()
plt.show()
def bugtrend(milestone):
    baseWorkingDirectory = "/tmp/"
    wikiTableBaseFileName = baseWorkingDirectory + "DefectChurnReport"
    wikiImageFileBaseLocation = "http://metrics.arubanetworks.com/metrics/margot_autopages/"
    wikiContent = []
    Queries = milestone.split(",")
    default_column_value = {
        "datemaxbabug_when": datetime.date(2030, 12, 1),
        "datebcreation_ts": datetime.date(2005, 1, 1),
        "cf_customers": "Aruba Internal",
    }
    for params in Queries:
        print "processing : " + params
        wikiContent = []
        baseFileName = params
        bugReportName = baseWorkingDirectory + baseFileName + "_bugs.csv"
        fixedReportName = baseWorkingDirectory + baseFileName + "_fixed.csv"
        outputReportName = baseWorkingDirectory + baseFileName + "_merged.csv"
        r = mlab.csv2rec(bugReportName)
        s = mlab.csv2rec(fixedReportName)
        k = mlab.rec_join("bug_id", s, r, jointype="outer",
                          defaults=default_column_value,
                          r1postfix="1", r2postfix="2")
        t = mlab.csv2rec("/home/automation/bugzilla_tool/Org_Mapping.csv")
        # mlab.rec2csv(k,outputReportName,delimiter=',',missing="",missingd=None,withheader=True)
        org_mapping = dict(zip(t.login_name, range(len(t))))
        # orgList = []
        DirectorArray = np.zeros_like(k.login_name)
        ComponentArray = np.zeros_like(k.login_name)
        ManagerArray = np.zeros_like(k.login_name)
        for i in range(len(k)):
            if k[i].login_name in org_mapping.keys():
                DirectorArray[i] = t[org_mapping[k[i].login_name]].director
                ComponentArray[i] = t[org_mapping[k[i].login_name]].functional_group
                ManagerArray[i] = t[org_mapping[k[i].login_name]].manager
            else:
                DirectorArray[i] = t[org_mapping["*****@*****.**"]].director
                ComponentArray[i] = t[org_mapping["*****@*****.**"]].functional_group
                ManagerArray[i] = t[org_mapping["*****@*****.**"]].manager
        k = mlab.rec_append_fields(k, "Director", DirectorArray)
        k = mlab.rec_append_fields(k, "Component", ComponentArray)
        k = mlab.rec_append_fields(k, "Manager", ManagerArray)
        mlab.rec2csv(k, outputReportName, delimiter=",", missing="",
                     missingd=None, withheader=True)

        # Start preparing the data for plotting
        chartFileName = baseWorkingDirectory + baseFileName + ".png"
        plotDefectTrend(k, chartFileName, baseFileName)
        s = "= Overall Defect Trend = \n"
        wikiContent.append(s)
        s = wikiImageFileBaseLocation + baseFileName + ".png \n"
        wikiContent.append(s)
        # Directors = ('Murali Duvvury','Shankar','Jie Jiang')
        Directors = list(np.unique(np.array(k.Director)))
        s = "= Director level Defect Trend = \n"
        wikiContent.append(s)
        wikiTableFileName = wikiTableBaseFileName + "_" + params + ".wiki"
        f = open(wikiTableFileName, "w")
        hdrList = ("Director", "Open Defects", "Need Info", "Observe",
                   "Resolved-Fixed", "Resolved-Other", "Incoming")
        printWikiTableOpen(f, hdrList)
        for Dir in Directors:
            s = Dir
            DirFileName = Dir.replace(" ", "_")
            DirRe = re.compile(s)
            DirReMatch = np.vectorize(lambda x: bool(DirRe.match(x)))
            sel = DirReMatch(np.array(k.Director))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + DirFileName + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + DirFileName)
            printChurnReport(k[sel], Dir, f)
            s = wikiImageFileBaseLocation + baseFileName + "-" + DirFileName + ".png \n"
            wikiContent.append(s)
            # chartFileName = baseWorkingDirectory + baseFileName + "-" + Dir +".png"
            # plotDefectTrend(k[k.Director == Dir],chartFileName, baseFileName + '-' + Dir)
        printWikiTableClose(f)
        s = "= Component level Defect Trend = \n"
        wikiContent.append(s)
        ComponentList = [
            ["GSM", "GSM"],
            ["UI-Configuration", "UI"],
            ["AP-Platform", "11ac"],
            ["Switch-Datapath", "Datapath"],
            ["HA-Lite", "HA-Lite"],
            ["Switch-Platform", "CIMU"],
            ["Feature-Bugs", "\w+]"],
        ]
        for c in ComponentList:
            s = "^\[*" + c[1]
            componentRe = re.compile(s)
            componentReMatch = np.vectorize(lambda x: bool(componentRe.match(x)))
            sel = np.logical_or(componentReMatch(np.array(k.short_desc)), k.name == c[0])
            chartFileName = baseWorkingDirectory + baseFileName + "-" + c[0] + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + c[0])
        s = "= Keyword level Defect Trend = \n"
        wikiContent.append(s)
        KeywordList = [
            ["TC-Blocker", "TC\-blocker"],
            ["SystemTest", "ST"],
            ["Smoke", "Smoke\-Failure"],
            ["CFT", "CFT"],
            ["MustFix", "MustFix"],
        ]
        for keyword in KeywordList:
            s = keyword[1]
            keywordRe = re.compile(s)
            keywordReMatch = np.vectorize(lambda x: bool(keywordRe.match(x)))
            sel = keywordReMatch(np.array(k.keywords))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + keyword[0] + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + keyword[0])
        # wikiTableFileName = wikiTableBaseFileName + '_' + params + '.wiki'
        # f = open(wikiTableFileName , 'w')
        hdrList = ("Manager", "Open Defects", "Need Info", "Observe",
                   "Resolved-Fixed", "Resolved-Other", "Incoming")
        printWikiTableOpen(f, hdrList)
        ManagerList = list(np.unique(np.array(k.Manager)))
        today = datetime.date.today()
        resolvedDateRange = today + datetime.timedelta(days=-14)
        s = "= Manager Defect Trend = \n"
        wikiContent.append(s)
        for mgr in ManagerList:
            s = mgr
            mgrFileName = mgr.replace(" ", "_")
            mgrRe = re.compile(s)
            mgrReMatch = np.vectorize(lambda x: bool(mgrRe.match(x)))
            sel = mgrReMatch(np.array(k.Manager))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + mgrFileName + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + mgrFileName)
            printChurnReport(k[sel], mgr, f)
            s = wikiImageFileBaseLocation + baseFileName + "-" + mgrFileName + ".png \n"
            wikiContent.append(s)
        printWikiTableClose(f)
        f.write("".join(wikiContent))
        f.close()
        plt.close("all")
def customDefectTrend(chartFileName, title):
    default_column_value = {
        "datemaxbabug_when": datetime.date(2030, 12, 1),
        "datebcreation_ts": datetime.date(2005, 1, 1),
        "cf_customers": "Aruba Internal",
    }
    cfdtuple, cfdclose = cfdbug()
    k = mlab.rec_join(
        "bug_id", cfdtuple, cfdclose, jointype="outer",
        defaults=default_column_value, r1postfix="1", r2postfix="2"
    )
    incomingRateList = []
    fixRateList = []
    fixWeekList = []
    resolvedFixedRateList = []
    outstandingList = []
    today = datetime.date.today()
    toRangeDate = today + datetime.timedelta(days=-today.weekday() - 1, weeks=1)
    fromRangeDate = toRangeDate + relativedelta.relativedelta(weeks=-13)
    iterDate = fromRangeDate
    while iterDate <= toRangeDate:
        curDate = iterDate
        iterDate = iterDate + relativedelta.relativedelta(weeks=1)
        fixWeekList.append(iterDate.strftime("%m/%d"))
        fixRate = np.logical_and(
            np.logical_and(k.datemaxbabug_when > curDate, k.datemaxbabug_when <= iterDate),
            k.resolution != ""
        ).sum()
        incomingRate = (np.logical_and(k.datebcreation_ts > curDate, k.datebcreation_ts <= iterDate)).sum()
        resolvedFixedRate = np.logical_and(
            np.logical_and(k.datemaxbabug_when > curDate, k.datemaxbabug_when <= iterDate),
            k.resolution == "FIXED"
        ).sum()
        outstandingCount = (k.datebcreation_ts <= iterDate).sum() - np.logical_and(
            k.datemaxbabug_when <= iterDate, k.resolution != ""
        ).sum()
        fixRateList.append(fixRate)
        incomingRateList.append(incomingRate)
        resolvedFixedRateList.append(resolvedFixedRate)
        outstandingList.append(outstandingCount)

    width = 0.35
    ind = np.arange(len(fixWeekList))
    ax = plt.subplot(111)
    rects1 = plt.bar(ind, incomingRateList, width, color="r")
    rects2 = plt.bar(ind + width, fixRateList, width, color="y")
    rects3 = plt.bar(ind + width, resolvedFixedRateList, width, color="b")
    ax2 = ax.twinx()
    rects5 = ax2.plot(ind + width, outstandingList, "b-", linewidth=3)
    plt.ylabel("Defect Count")
    ax.set_ylabel(r"Opened / Verified / Resolved")
    ax2.set_ylabel(r"Outstanding Defect Count")
    plt.title(title + " Defect Trend" + "-%s" % (datetime.date.today()))
    plt.xticks(ind + width, fixWeekList)
    # plt.legend((rects1[0], rects2[0] ,rects3[0],rects4[0],rects5[0],rects6[0],rects7[0],rects8[0]), ('incoming', 'resolved-other', 'resolved-fixed','regression','outstanding (right-axis)','SS_P1 (right-axis)','MustFix (right-axis)','Overall (right-axis)'),loc="upper left")
    plt.legend(
        (rects1[0], rects2[0], rects3[0], rects5[0]),
        ("Opened", "Verified", "Resolved", "outstanding (right-axis)"),
        loc="upper left",
    )
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize="xx-small")
    autolabel(rects1, ax)
    autolabel(rects2, ax)
    plt.setp(ax.get_xticklabels(), rotation="90", fontsize=12)
    F = plt.gcf()
    F.set_size_inches(8, 6)
    F.savefig(chartFileName, dpi=(100))
    plt.clf()
from __future__ import print_function
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.cbook as cbook

datafile = cbook.get_sample_data('aapl.csv', asfileobj=False)
print('loading', datafile)
r = mlab.csv2rec(datafile)

r.sort()
r1 = r[-10:]

# Create a new array
r2 = np.empty(12, dtype=[('date', '|O4'), ('high', np.float),
                         ('marker', np.float)])
r2 = r2.view(np.recarray)
r2.date = r.date[-17:-5]
r2.high = r.high[-17:-5]
r2.marker = np.arange(12)

print("r1:")
print(mlab.rec2txt(r1))
print("r2:")
print(mlab.rec2txt(r2))

defaults = {'marker': -1, 'close': np.NaN, 'low': -4444.}

for s in ('inner', 'outer', 'leftouter'):
    rec = mlab.rec_join(['date', 'high'], r1, r2,
                        jointype=s, defaults=defaults)
    print("\n%sjoin :\n%s" % (s, mlab.rec2txt(rec)))
def run_diagnostic(self):
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # ID field
    id_field = p.summary_level + 'ID'

    # Root directory for Riemann files
    root_dir = p.riemann_output_folder

    # Read in hex input file
    obs_data = utilities.csv2rec(self.hex_attribute_file)

    # Get the hexagon levels and ensure that the fields exist in the
    # hex_attribute file
    hex_resolutions = p.riemann_hex_resolutions
    hex_fields = [x[0] for x in hex_resolutions]
    for field in hex_fields:
        if field not in obs_data.dtype.names:
            err_msg = 'Field ' + field + ' does not exist in the '
            err_msg += 'hex_attribute file'
            raise ValueError(err_msg)

    # Create the directory structure based on the hex levels
    hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
    all_levels = ['plot_pixel'] + hex_levels
    for level in all_levels:
        sub_dir = os.path.join(root_dir, level)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

    # Get the values of k
    k_values = p.riemann_k_values

    # Create a dictionary of plot ID to image year (or model_year for
    # non-imagery models) for these plots
    if p.model_type in p.imagery_model_types:
        id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
    else:
        id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

    # Create the lookup of id_field to LOC_ID for the hex plots
    nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

    # Create a dictionary between id_field and no_self_assign_field
    # for the model plots
    env_file = p.environmental_matrix_file
    env_data = utilities.csv2rec(env_file)
    model_nsa_id_dict = dict(
        (getattr(x, id_field), x.LOC_ID) for x in env_data)

    # Stitch the two dictionaries together
    for id in sorted(model_nsa_id_dict.keys()):
        if id not in nsa_id_dict:
            nsa_id_dict[id] = model_nsa_id_dict[id]

    # Get the stand attribute metadata and retrieve only the
    # continuous accuracy attributes
    stand_metadata_file = p.stand_metadata_file
    mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
    attrs = [
        x.field_name for x in mp.attributes
        if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1]

    # Subset the attributes for fields that are in the
    # hex_attribute file
    attrs = [x for x in attrs if x in obs_data.dtype.names]
    plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

    # Write out the plot_pixel observed file
    file_name = 'plot_pixel_observed.csv'
    output_file = os.path.join(root_dir, 'plot_pixel', file_name)
    utilities.rec2csv(plot_pixel_obs, output_file)

    # Iterate over values of k
    for k in k_values:

        # Construct the output file name
        file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
        file_name += '.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        out_fh = open(output_file, 'w')

        # For the plot/pixel scale, retrieve the independent predicted
        # data for this value of k.  Even though attributes are being
        # returned from this function, we want to use the attribute list
        # that we've already found above.
        prediction_generator = pr.calculate_predictions_at_k(
            k=k, id_field=id_field, independent=True,
            nsa_id_dict=nsa_id_dict)

        # Write out the field names
        out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

        # Write out the predictions for this k
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

        # Close this file
        out_fh.close()

    # Create the fields for which to extract statistics at the hexagon
    # levels
    mean_fields = [(id_field, len, 'PLOT_COUNT')]
    mean_fields.extend([(x, np.mean, x) for x in attrs])
    mean_fields = tuple(mean_fields)

    sd_fields = [(id_field, len, 'PLOT_COUNT')]
    sd_fields.extend([(x, np.std, x) for x in attrs])
    sd_fields = tuple(sd_fields)

    stat_sets = {
        'mean': mean_fields,
        'std': sd_fields,
    }

    # For each hexagon level, associate the plots with their hexagon ID
    # and find observed and predicted statistics for each hexagon
    for hex_resolution in hex_resolutions:

        (hex_id_field, hex_distance) = hex_resolution[0:2]
        min_plots_per_hex = hex_resolution[3]
        prefix = 'hex_' + str(hex_distance)

        # Create a crosswalk between the id_field and the hex_id_field
        id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

        # Iterate over all sets of statistics and write a unique file
        # for each set
        for (stat_name, stat_fields) in stat_sets.iteritems():

            # Get the output file name
            obs_out_file = \
                '_'.join((prefix, 'observed', stat_name)) + '.csv'
            obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

            # Write out the observed file
            self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                                 min_plots_per_hex, obs_out_file)

        # Iterate over values of k for the predicted values
        for k in k_values:

            # Open the plot_pixel predicted file for this value of k
            # and join the hex_id_field to the recarray
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
            prd_data = utilities.csv2rec(prd_file)
            prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                prd_out_file = '_'.join((
                    prefix, 'predicted', 'k' + str(k), stat_name)) + '.csv'
                prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                # Write out the predicted file
                self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                                     min_plots_per_hex, prd_out_file)

    # Calculate the ECDF and AC statistics
    # For ECDF and AC, it is a paired comparison between the observed
    # and predicted data.  We do this at each value of k and for each
    # hex resolution level.

    # Open the stats file
    stats_file = p.hex_statistics_file
    stats_fh = open(stats_file, 'w')
    header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
    stats_fh.write(','.join(header_fields) + '\n')

    # Create a list of RiemannComparison instances which store the
    # information needed to do comparisons between observed and predicted
    # files for any level or value of k
    compare_list = []
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        prefix = 'hex_' + str(hex_distance)
        obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = '_'.join((
                prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(
                prefix, obs_file, prd_file, hex_id_field, k)
            compare_list.append(r)

    # Add the plot_pixel comparisons to this list
    prefix = 'plot_pixel'
    obs_file = 'plot_pixel_observed.csv'
    obs_file = os.path.join(root_dir, prefix, obs_file)
    for k in k_values:
        prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
        prd_file = os.path.join(root_dir, prefix, prd_file)
        r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
        compare_list.append(r)

    # Do all the comparisons
    for c in compare_list:

        # Open the observed file
        obs_data = utilities.csv2rec(c.obs_file)

        # Open the predicted file
        prd_data = utilities.csv2rec(c.prd_file)

        # Ensure that the IDs between the observed and predicted
        # data line up
        ids1 = getattr(obs_data, c.id_field)
        ids2 = getattr(prd_data, c.id_field)
        if np.all(ids1 != ids2):
            err_msg = 'IDs do not match between observed and '
            err_msg += 'predicted data'
            raise ValueError(err_msg)

        for attr in attrs:
            arr1 = getattr(obs_data, attr)
            arr2 = getattr(prd_data, attr)
            rv = RiemannVariable(arr1, arr2)

            gmfr_stats = rv.gmfr_statistics()
            for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    gmfr_stats[stat])
                stats_fh.write(stat_line)

            ks_stats = rv.ks_statistics()
            for stat in ('ks_max', 'ks_mean'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    ks_stats[stat])
                stats_fh.write(stat_line)