Example #1
import numpy as np
from matplotlib.mlab import rec_join  # only available in older matplotlib
# `winsorize` is a helper from the surrounding project (not shown in this
# example); it clips the values of an array at the given percentile.


def cov_estimation(list_of_recarrays, index_name, pair_wise=False):
    def get_the_other_name(rec, index_name):
        # Each recarray has exactly two columns: the index and one data column
        assert len(rec.dtype.names) == 2
        names = [nm for nm in rec.dtype.names if nm != index_name]
        assert len(names) == 1
        return names[0]

    # Winsorize the data column of each array to damp outliers
    for array in list_of_recarrays:
        data_name = get_the_other_name(array, index_name)
        array[data_name] = winsorize(array[data_name], 99)

    nn = len(list_of_recarrays)
    if not pair_wise:
        # Inner-join all arrays on the index and compute the covariance of
        # the stacked data columns (only rows common to every array survive)
        new_rec = list_of_recarrays[0]
        for ii in range(1, nn):
            new_rec = rec_join(index_name, new_rec, list_of_recarrays[ii],
                               jointype='inner', defaults=None,
                               r1postfix='', r2postfix=str(ii + 1))
        dat_mat = np.c_[[new_rec[nm] for nm in new_rec.dtype.names
                         if nm != index_name]]
        covmat = np.cov(dat_mat)
    else:
        # Estimate each entry from the pairwise overlap of the two arrays
        # involved, so differing index coverage wastes less data
        covmat = np.zeros((nn, nn))
        for ii in range(nn):
            covmat[ii, ii] = list_of_recarrays[ii][
                get_the_other_name(list_of_recarrays[ii], index_name)].var()
            for jj in range(ii + 1, nn):
                new_rec = rec_join(index_name, list_of_recarrays[ii],
                                   list_of_recarrays[jj], jointype='inner',
                                   defaults=None, r1postfix='1',
                                   r2postfix='2')
                dat_mat = np.c_[[new_rec[nm] for nm in new_rec.dtype.names
                                 if nm != index_name]]
                tmp_cov = np.cov(dat_mat)[0, 1]
                covmat[ii, jj] = tmp_cov
                covmat[jj, ii] = tmp_cov
    return covmat
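A minimal usage sketch for the function above, on toy data. The `winsorize` stand-in below (a simple percentile clip) and the two random series are illustrative assumptions; the real helper comes from the example's own project.

import numpy as np

def winsorize(arr, pct):
    # hypothetical stand-in for the project's helper: clip both tails
    # at the (100 - pct)-th and pct-th percentiles
    lo = np.percentile(arr, 100 - pct)
    hi = np.percentile(arr, pct)
    return np.clip(arr, lo, hi)

rng = np.random.RandomState(0)
a = np.rec.fromarrays([np.arange(100), rng.randn(100)], names='date,x')
b = np.rec.fromarrays([np.arange(100), rng.randn(100)], names='date,y')

covmat = cov_estimation([a, b], 'date', pair_wise=True)  # 2x2 matrix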
Example #2
    def create_species_plot_count_file(self):
        p = self.parameter_parser

        # Store the list of plot IDs into a string if this variable hasn't
        # yet been created
        if not hasattr(self, 'id_str'):
            self.id_str = self._get_id_string()

        spp_plot_table = self.plot_db.get_species_plot_counts(self.id_str)
        spp_plot_file = p.model_directory + '/' + p.model_type + \
            '_spp_plot_counts.csv'
        if p.model_type in p.imagery_model_types:
            utilities.rec2csv(spp_plot_table, spp_plot_file)
        else:
            # Create two ID strings for non-imagery models: one with inventory
            # plots and Ecoplots, and one with inventory plots only
            if 'ecoplot' not in p.plot_types:
                # If Ecoplots are not already in the list, create another ID
                # string with them included (work on a copy so the parser's
                # list is not mutated)
                plot_types_w_eco = list(p.plot_types) + ['ecoplot']
                plot_types_w_eco_str = ','.join(plot_types_w_eco)
                id_str2 = self._get_id_string(plot_types_w_eco_str)
                id_eco = 2
            else:
                # If Ecoplots are already in the list, create another ID
                # string without them included
                plot_types_wo_eco = [x for x in p.plot_types if x != 'ecoplot']
                plot_types_wo_eco_str = ','.join(plot_types_wo_eco)
                id_str2 = self._get_id_string(plot_types_wo_eco_str)
                id_eco = 1

            spp_plot_table2 = self.plot_db.get_species_plot_counts(id_str2)

            # Join the plot counts w/ Ecoplots to the plot counts w/o Ecoplots
            if id_eco == 1:
                joined_spp_plot_table = mlab.rec_join(
                    'SPP_LAYER', spp_plot_table, spp_plot_table2, 'leftouter')
            else:
                joined_spp_plot_table = mlab.rec_join(
                    'SPP_LAYER', spp_plot_table2, spp_plot_table, 'leftouter')
            utilities.rec2csv(joined_spp_plot_table, spp_plot_file)
Example #3
def recs_inner_join(key, recs, postfixes):
    # NOTE: `postfixes` is accepted but never used; colliding column names
    # always receive the '1'/'2' postfixes from mlab.rec_join
    new_rec = recs[0]
    for ii in range(1, len(recs)):
        new_rec = mlab.rec_join(key, new_rec, recs[ii], jointype='inner',
                                defaults=None, r1postfix='1', r2postfix='2')
    return new_rec
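A small usage sketch on toy recarrays sharing an `id` key (illustrative data only; since `postfixes` is ignored by the function as written, any value can be passed):

import numpy as np
import matplotlib.mlab as mlab  # rec_join exists only in older matplotlib

r1 = np.rec.fromarrays([[1, 2, 3], [10., 20., 30.]], names='id,a')
r2 = np.rec.fromarrays([[2, 3, 4], [.2, .3, .4]], names='id,b')
r3 = np.rec.fromarrays([[3, 4, 5], [7., 8., 9.]], names='id,c')

joined = recs_inner_join('id', [r1, r2, r3], postfixes=None)
# only id == 3 appears in all three inputs, so `joined` has a single row
# with columns ('id', 'a', 'b', 'c')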
Example #4
    def get_predicted_estimates(self):

        # Read in the predicted raster
        ds = gdal.Open(self.predicted_raster, gdalconst.GA_ReadOnly)
        rat = ds.GetRasterBand(1).GetDefaultRAT()

        # Get the cell area for converting from pixel counts to hectares
        gt = ds.GetGeoTransform()
        cell_area = gt[1] * gt[1]  # assumes square pixels (gt[5] is -gt[1])

        # Get the IDs and counts (converted to hectares)
        id_recs = []
        nf_hectares = 0
        for i in range(rat.GetRowCount()):
            id = rat.GetValueAsInt(i, 0)
            hectares = rat.GetValueAsInt(i, 1) * cell_area / 10000.0
            if id <= 0:
                nf_hectares += hectares
            else:
                id_recs.append((id, hectares))

        # Release the dataset
        ds = None

        # Convert this to a recarray
        names = (self.id_field, 'HECTARES')
        ids = np.rec.fromrecords(id_recs, names=names)

        # Read in the attribute file
        sad = utilities.csv2rec(self.stand_attribute_file)

        # Ensure that all IDs in the raster are present in the attribute data
        ids_1 = getattr(ids, self.id_field)
        ids_2 = getattr(sad, self.id_field)
        if not np.all(np.in1d(ids_1, ids_2)):
            err_msg = 'Not all values in the raster are present in the '
            err_msg += 'attribute data'
            raise ValueError(err_msg)

        # Join the two recarrays together
        predicted_data = mlab.rec_join(self.id_field, ids, sad)
        return (predicted_data, nf_hectares)
Example #5
import numpy as np
import matplotlib.mlab as mlab


r = mlab.csv2rec('../data/aapl.csv')
r.sort()
r1 = r[-10:]

# Create a new array
r2 = np.empty(12, dtype=[('date', '|O4'), ('high', np.float),
                            ('marker', np.float)])
r2 = r2.view(np.recarray)
r2.date = r.date[-17:-5]
r2.high = r.high[-17:-5]
r2.marker = np.arange(12)

print "r1:"
print mlab.rec2txt(r1)
print "r2:"
print mlab.rec2txt(r2)

defaults = {'marker':-1, 'close':np.NaN, 'low':-4444.}

for s in ('inner', 'outer', 'leftouter'):
    rec = mlab.rec_join(['date', 'high'], r1, r2,
            jointype=s, defaults=defaults)
    print "\n%sjoin :\n%s" % (s, mlab.rec2txt(rec))
Example #6
import urllib

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# grab the price data off yahoo (this download endpoint has since been
# retired; the URLs are kept for reference)
u1 = urllib.urlretrieve('http://ichart.finance.yahoo.com/table.csv?s=AAPL&d=9&e=14&f=2008&g=d&a=8&b=7&c=1984&ignore=.csv')
u2 = urllib.urlretrieve('http://ichart.finance.yahoo.com/table.csv?s=GOOG&d=9&e=14&f=2008&g=d&a=8&b=7&c=1984&ignore=.csv')

# load the CSV files into record arrays
r1 = mlab.csv2rec(open(u1[0]))
r2 = mlab.csv2rec(open(u2[0]))

# compute the daily returns and add these columns to the arrays
gains1 = np.zeros_like(r1.adj_close)
gains2 = np.zeros_like(r2.adj_close)
gains1[1:] = np.diff(r1.adj_close)/r1.adj_close[:-1]
gains2[1:] = np.diff(r2.adj_close)/r2.adj_close[:-1]
r1 = mlab.rec_append_fields(r1, 'gains', gains1)
r2 = mlab.rec_append_fields(r2, 'gains', gains2)

# now join them by date; the default postfixes are 1 and 2
r = mlab.rec_join('date', r1, r2)

# long AAPL, short GOOG
g = r.gains1-r.gains2
tr = (1+g).cumprod()  # the total return

# plot the return
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(r.date, tr)
ax.set_title('total return: long AAPL, short GOOG')
ax.grid()
fig.autofmt_xdate()
plt.show()
Example #7
# load the CSV files into record arrays (u1 and u2 are the files downloaded
# from Yahoo in the previous example)
r1 = mlab.csv2rec(open(u1[0]))
r2 = mlab.csv2rec(open(u2[0]))

# compute the daily returns and add these columns to the arrays
gains1 = np.zeros_like(r1.adj_close)
gains2 = np.zeros_like(r2.adj_close)
gains1[1:] = np.diff(r1.adj_close) / r1.adj_close[:-1]
gains2[1:] = np.diff(r2.adj_close) / r2.adj_close[:-1]
r1 = mlab.rec_append_fields(r1, 'gains', gains1)
r2 = mlab.rec_append_fields(r2, 'gains', gains2)

# now join them by date; the default postfixes are 1 and 2.  The
# default jointype is inner so it will do an intersection of dates and
# drop the dates in AAPL which occurred before GOOG started trading in
# 2004.  r1 and r2 are reverse ordered by date since Yahoo returns
# most recent first in the CSV files, but rec_join will sort by key so
# r below will be properly sorted
r = mlab.rec_join('date', r1, r2)

# long AAPL, short GOOG
g = r.gains1 - r.gains2
tr = (1 + g).cumprod()  # the total return

# plot the return
fig, ax = plt.subplots()
ax.plot(r.date, tr)
ax.set_title('total return: long AAPL, short GOOG')
ax.grid()
fig.autofmt_xdate()
plt.show()
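The comment above describes mlab.rec_join's inner-join behavior: the keys are intersected, the result is sorted by key, and colliding column names get the '1'/'2' postfixes. A tiny illustrative check on toy data (not part of the original example):

import numpy as np
import matplotlib.mlab as mlab  # rec_join exists only in older matplotlib

a = np.rec.fromarrays([[3, 1, 2], [.3, .1, .2]], names='date,gains')
b = np.rec.fromarrays([[2, 4, 3], [.02, .04, .03]], names='date,gains')

r = mlab.rec_join('date', a, b)  # jointype defaults to 'inner'
print(r.date)         # [2 3] -- intersection of the keys, sorted
print(r.dtype.names)  # ('date', 'gains1', 'gains2')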
Example #8
def bugtrend(milestone):
    baseWorkingDirectory = "/tmp/"
    wikiTableBaseFileName = baseWorkingDirectory + "DefectChurnReport"
    wikiImageFileBaseLocation = "http://metrics.arubanetworks.com/metrics/margot_autopages/"
    wikiContent = []
    Queries = milestone.split(",")
    default_column_value = {
        "datemaxbabug_when": datetime.date(2030, 12, 1),
        "datebcreation_ts": datetime.date(2005, 1, 1),
        "cf_customers": "Aruba Internal",
    }
    for params in Queries:
        print "processing : " + params
        wikiContent = []
        baseFileName = params
        bugReportName = baseWorkingDirectory + baseFileName + "_bugs.csv"
        fixedReportName = baseWorkingDirectory + baseFileName + "_fixed.csv"
        outputReportName = baseWorkingDirectory + baseFileName + "_merged.csv"

        r = mlab.csv2rec(bugReportName)
        s = mlab.csv2rec(fixedReportName)
        k = mlab.rec_join("bug_id", s, r, jointype="outer", defaults=default_column_value, r1postfix="1", r2postfix="2")
        t = mlab.csv2rec("/home/automation/bugzilla_tool/Org_Mapping.csv")
        # mlab.rec2csv(k,outputReportName,delimiter=',',missing="",missingd=None,withheader=True)

        org_mapping = dict(zip(t.login_name, range(len(t))))
        # orgList = []
        DirectorArray = np.zeros_like(k.login_name)
        ComponentArray = np.zeros_like(k.login_name)
        ManagerArray = np.zeros_like(k.login_name)

        for i in range(len(k)):
            if k[i].login_name in org_mapping:
                DirectorArray[i] = t[org_mapping[k[i].login_name]].director
                ComponentArray[i] = t[org_mapping[k[i].login_name]].functional_group
                ManagerArray[i] = t[org_mapping[k[i].login_name]].manager
            else:
                DirectorArray[i] = t[org_mapping["*****@*****.**"]].director
                ComponentArray[i] = t[org_mapping["*****@*****.**"]].functional_group
                ManagerArray[i] = t[org_mapping["*****@*****.**"]].manager

        k = mlab.rec_append_fields(k, "Director", DirectorArray)
        k = mlab.rec_append_fields(k, "Component", ComponentArray)
        k = mlab.rec_append_fields(k, "Manager", ManagerArray)
        mlab.rec2csv(k, outputReportName, delimiter=",", missing="", missingd=None, withheader=True)

        # Start preparing the data for plotting

        chartFileName = baseWorkingDirectory + baseFileName + ".png"
        plotDefectTrend(k, chartFileName, baseFileName)

        s = "= Overall Defect Trend = \n"
        wikiContent.append(s)
        s = wikiImageFileBaseLocation + baseFileName + ".png \n"
        wikiContent.append(s)

        #    Directors = ('Murali Duvvury','Shankar','Jie Jiang')
        Directors = list(np.unique(np.array(k.Director)))

        s = "= Director level Defect Trend = \n"
        wikiContent.append(s)

        wikiTableFileName = wikiTableBaseFileName + "_" + params + ".wiki"

        f = open(wikiTableFileName, "w")
        hdrList = ("Director", "Open Defects", "Need Info", "Observe", "Resolved-Fixed", "Resolved-Other", "Incoming")
        printWikiTableOpen(f, hdrList)

        for Dir in Directors:
            s = Dir
            DirFileName = Dir.replace(" ", "_")
            DirRe = re.compile(s)
            DirReMatch = np.vectorize(lambda x: bool(DirRe.match(x)))
            sel = DirReMatch(np.array(k.Director))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + DirFileName + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + DirFileName)
            printChurnReport(k[sel], Dir, f)
            s = wikiImageFileBaseLocation + baseFileName + "-" + DirFileName + ".png \n"
            wikiContent.append(s)
        #        chartFileName = baseWorkingDirectory + baseFileName + "-" + Dir +".png"
        #        plotDefectTrend(k[k.Director == Dir],chartFileName, baseFileName + '-' + Dir)

        printWikiTableClose(f)

        s = "= Component level Defect Trend = \n"
        wikiContent.append(s)

        ComponentList = [
            ["GSM", "GSM"],
            ["UI-Configuration", "UI"],
            ["AP-Platform", "11ac"],
            ["Switch-Datapath", "Datapath"],
            ["HA-Lite", "HA-Lite"],
            ["Switch-Platform", "CIMU"],
            ["Feature-Bugs", "\w+]"],
        ]

        for c in ComponentList:
            s = "^\[*" + c[1]
            componentRe = re.compile(s)
            componentReMatch = np.vectorize(lambda x: bool(componentRe.match(x)))
            sel = np.logical_or(componentReMatch(np.array(k.short_desc)), k.name == c[0])
            chartFileName = baseWorkingDirectory + baseFileName + "-" + c[0] + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + c[0])

        s = "= Keyword level Defect Trend = \n"
        wikiContent.append(s)

        KeywordList = [
            ["TC-Blocker", "TC\-blocker"],
            ["SystemTest", "ST"],
            ["Smoke", "Smoke\-Failure"],
            ["CFT", "CFT"],
            ["MustFix", "MustFix"],
        ]

        for keyword in KeywordList:
            s = keyword[1]
            keywordRe = re.compile(s)
            keywordReMatch = np.vectorize(lambda x: bool(keywordRe.match(x)))
            sel = keywordReMatch(np.array(k.keywords))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + keyword[0] + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + keyword[0])

        #   wikiTableFileName = wikiTableBaseFileName + '_' + params + '.wiki'

        #    f = open(wikiTableFileName , 'w')
        hdrList = ("Manager", "Open Defects", "Need Info", "Observe", "Resolved-Fixed", "Resolved-Other", "Incoming")
        printWikiTableOpen(f, hdrList)

        ManagerList = list(np.unique(np.array(k.Manager)))
        today = datetime.date.today()
        resolvedDateRange = today + datetime.timedelta(days=-14)

        s = "= Manager Defect Trend = \n"
        wikiContent.append(s)

        for mgr in ManagerList:
            s = mgr
            mgrFileName = mgr.replace(" ", "_")
            mgrRe = re.compile(s)
            mgrReMatch = np.vectorize(lambda x: bool(mgrRe.match(x)))
            sel = mgrReMatch(np.array(k.Manager))
            chartFileName = baseWorkingDirectory + baseFileName + "-" + mgrFileName + ".png"
            plotDefectTrend(k[sel], chartFileName, baseFileName + "-" + mgrFileName)
            printChurnReport(k[sel], mgr, f)
            s = wikiImageFileBaseLocation + baseFileName + "-" + mgrFileName + ".png \n"
            wikiContent.append(s)

        printWikiTableClose(f)
        f.write("".join(wikiContent))
        f.close()
    plt.close("all")
Example #9
def customDefectTrend(chartFileName, title):
    default_column_value = {
        "datemaxbabug_when": datetime.date(2030, 12, 1),
        "datebcreation_ts": datetime.date(2005, 1, 1),
        "cf_customers": "Aruba Internal",
    }
    cfdtuple, cfdclose = cfdbug()
    k = mlab.rec_join(
        "bug_id", cfdtuple, cfdclose, jointype="outer", defaults=default_column_value, r1postfix="1", r2postfix="2"
    )
    incomingRateList = []
    fixRateList = []
    fixWeekList = []
    resolvedFixedRateList = []
    outstandingList = []
    today = datetime.date.today()
    toRangeDate = today + datetime.timedelta(days=-today.weekday() - 1, weeks=1)
    fromRangeDate = toRangeDate + relativedelta.relativedelta(weeks=-13)
    iterDate = fromRangeDate
    while iterDate <= toRangeDate:
        curDate = iterDate
        iterDate = iterDate + relativedelta.relativedelta(weeks=1)
        fixWeekList.append(iterDate.strftime("%m/%d"))
        fixRate = np.logical_and(
            np.logical_and(k.datemaxbabug_when > curDate, k.datemaxbabug_when <= iterDate), k.resolution != ""
        ).sum()
        incomingRate = (np.logical_and(k.datebcreation_ts > curDate, k.datebcreation_ts <= iterDate)).sum()
        resolvedFixedRate = np.logical_and(
            np.logical_and(k.datemaxbabug_when > curDate, k.datemaxbabug_when <= iterDate), k.resolution == "FIXED"
        ).sum()
        outstandingCount = (k.datebcreation_ts <= iterDate).sum() - np.logical_and(
            k.datemaxbabug_when <= iterDate, k.resolution != ""
        ).sum()
        fixRateList.append(fixRate)
        incomingRateList.append(incomingRate)
        resolvedFixedRateList.append(resolvedFixedRate)
        outstandingList.append(outstandingCount)
    width = 0.35
    ind = np.arange(len(fixWeekList))
    ax = plt.subplot(111)
    rects1 = plt.bar(ind, incomingRateList, width, color="r")
    rects2 = plt.bar(ind + width, fixRateList, width, color="y")
    rects3 = plt.bar(ind + width, resolvedFixedRateList, width, color="b")
    ax2 = ax.twinx()
    rects5 = ax2.plot(ind + width, outstandingList, "b-", linewidth=3)
    plt.ylabel("Defect Count")
    ax.set_ylabel(r"Opened / Verified / Resolved")
    ax2.set_ylabel(r"Outstanding Defect Count")
    plt.title(title + " Defect Trend" + "-%s" % (datetime.date.today()))
    plt.xticks(ind + width, fixWeekList)
    # plt.legend((rects1[0], rects2[0] ,rects3[0],rects4[0],rects5[0],rects6[0],rects7[0],rects8[0]), ('incoming', 'resolved-other', 'resolved-fixed','regression','outstanding (right-axis)','SS_P1 (right-axis)','MustFix (right-axis)','Overall (right-axis)'),loc="upper left")
    plt.legend(
        (rects1[0], rects2[0], rects3[0], rects5[0]),
        ("Opend", "Verified", "Resolved", "outstanding (right-axis)"),
        loc="upper left",
    )
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize="xx-small")

    autolabel(rects1, ax)
    autolabel(rects2, ax)

    plt.setp(ax.get_xticklabels(), rotation="90", fontsize=12)

    F = plt.gcf()
    F.set_size_inches(8, 6)
    F.savefig(chartFileName, dpi=(100))
    plt.clf()
Example #10
from __future__ import print_function
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.cbook as cbook

datafile = cbook.get_sample_data('aapl.csv', asfileobj=False)
print('loading', datafile)
r = mlab.csv2rec(datafile)

r.sort()
r1 = r[-10:]

# Create a new array
r2 = np.empty(12, dtype=[('date', '|O4'), ('high', np.float),
                         ('marker', np.float)])
r2 = r2.view(np.recarray)
r2.date = r.date[-17:-5]
r2.high = r.high[-17:-5]
r2.marker = np.arange(12)

print("r1:")
print(mlab.rec2txt(r1))
print("r2:")
print(mlab.rec2txt(r2))

defaults = {'marker': -1, 'close': np.NaN, 'low': -4444.}

for s in ('inner', 'outer', 'leftouter'):
    rec = mlab.rec_join(['date', 'high'], r1, r2,
                        jointype=s, defaults=defaults)
    print("\n%sjoin :\n%s" % (s, mlab.rec2txt(rec)))
Example #11
    def run_diagnostic(self):

        # Shortcut to the parameter parser
        p = self.parameter_parser

        # ID field
        id_field = p.summary_level + 'ID'

        # Root directory for Riemann files
        root_dir = p.riemann_output_folder

        # Read in hex input file
        obs_data = utilities.csv2rec(self.hex_attribute_file)

        # Get the hexagon levels and ensure that the fields exist in the
        # hex_attribute file
        hex_resolutions = p.riemann_hex_resolutions
        hex_fields = [x[0] for x in hex_resolutions]
        for field in hex_fields:
            if field not in obs_data.dtype.names:
                err_msg = 'Field ' + field + ' does not exist in the '
                err_msg += 'hex_attribute file'
                raise ValueError(err_msg)

        # Create the directory structure based on the hex levels
        hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
        all_levels = ['plot_pixel'] + hex_levels
        for level in all_levels:
            sub_dir = os.path.join(root_dir, level)
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)

        # Get the values of k
        k_values = p.riemann_k_values

        # Create a dictionary of plot ID to image year (or model_year for
        # non-imagery models) for these plots
        if p.model_type in p.imagery_model_types:
            id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
        else:
            id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

        # Create a PredictionRun instance
        pr = prediction_run.PredictionRun(p)

        # Get the neighbors and distances for these IDs
        pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

        # Create the lookup of id_field to LOC_ID for the hex plots
        nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

        # Create a dictionary between id_field and no_self_assign_field
        # for the model plots
        env_file = p.environmental_matrix_file
        env_data = utilities.csv2rec(env_file)
        model_nsa_id_dict = dict(
            (getattr(x, id_field), x.LOC_ID) for x in env_data)

        # Stitch the two dictionaries together
        for id in sorted(model_nsa_id_dict.keys()):
            if id not in nsa_id_dict:
                nsa_id_dict[id] = model_nsa_id_dict[id]

        # Get the stand attribute metadata and retrieve only the
        # continuous accuracy attributes
        stand_metadata_file = p.stand_metadata_file
        mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
        attrs = [
            x.field_name for x in mp.attributes
            if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1
        ]

        # Subset the attributes for fields that are in the
        # hex_attribute file
        attrs = [x for x in attrs if x in obs_data.dtype.names]
        plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

        # Write out the plot_pixel observed file
        file_name = 'plot_pixel_observed.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        utilities.rec2csv(plot_pixel_obs, output_file)

        # Iterate over values of k
        for k in k_values:

            # Construct the output file name
            file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
            file_name += '.csv'
            output_file = os.path.join(root_dir, 'plot_pixel', file_name)
            out_fh = open(output_file, 'w')

            # For the plot/pixel scale, retrieve the independent predicted
            # data for this value of k.  Even though attributes are being
            # returned from this function, we want to use the attribute list
            # that we've already found above.
            prediction_generator = pr.calculate_predictions_at_k(
                k=k,
                id_field=id_field,
                independent=True,
                nsa_id_dict=nsa_id_dict)

            # Write out the field names
            out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

            # Write out the predictions for this k
            for plot_prediction in prediction_generator:

                # Write this record to the predicted attribute file
                pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

            # Close this file
            out_fh.close()

        # Create the fields for which to extract statistics at the hexagon
        # levels
        mean_fields = [(id_field, len, 'PLOT_COUNT')]
        mean_fields.extend([(x, np.mean, x) for x in attrs])
        mean_fields = tuple(mean_fields)

        sd_fields = [(id_field, len, 'PLOT_COUNT')]
        sd_fields.extend([(x, np.std, x) for x in attrs])
        sd_fields = tuple(sd_fields)

        stat_sets = {
            'mean': mean_fields,
            'std': sd_fields,
        }

        # For each hexagon level, associate the plots with their hexagon ID
        # and find observed and predicted statistics for each hexagon
        for hex_resolution in hex_resolutions:

            (hex_id_field, hex_distance) = hex_resolution[0:2]
            min_plots_per_hex = hex_resolution[3]
            prefix = 'hex_' + str(hex_distance)

            # Create a crosswalk between the id_field and the hex_id_field
            id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                obs_out_file = \
                    '_'.join((prefix, 'observed', stat_name)) + '.csv'
                obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

                # Write out the observed file
                self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                                     min_plots_per_hex, obs_out_file)

            # Iterate over values of k for the predicted values
            for k in k_values:

                # Open the plot_pixel predicted file for this value of k
                # and join the hex_id_field to the recarray
                prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
                prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
                prd_data = utilities.csv2rec(prd_file)
                prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

                # Iterate over all sets of statistics and write a unique file
                # for each set
                for (stat_name, stat_fields) in stat_sets.iteritems():

                    # Get the output file name
                    prd_out_file = '_'.join((prefix, 'predicted', 'k' + str(k),
                                             stat_name)) + '.csv'
                    prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                    # Write out the predicted file
                    self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                                         min_plots_per_hex, prd_out_file)

        # Calculate the ECDF and AC statistics
        # For ECDF and AC, it is a paired comparison between the observed
        # and predicted data.  We do this at each value of k and for each
        # hex resolution level.

        # Open the stats file
        stats_file = p.hex_statistics_file
        stats_fh = open(stats_file, 'w')
        header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
        stats_fh.write(','.join(header_fields) + '\n')

        # Create a list of RiemannComparison instances which store the
        # information needed to do comparisons between observed and predicted
        # files for any level or value of k
        compare_list = []
        for hex_resolution in hex_resolutions:
            (hex_id_field, hex_distance) = hex_resolution[0:2]
            prefix = 'hex_' + str(hex_distance)
            obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
            obs_file = os.path.join(root_dir, prefix, obs_file)
            for k in k_values:
                prd_file = '_'.join(
                    (prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
                prd_file = os.path.join(root_dir, prefix, prd_file)
                r = RiemannComparison(prefix, obs_file, prd_file, hex_id_field,
                                      k)
                compare_list.append(r)

        # Add the plot_pixel comparisons to this list
        prefix = 'plot_pixel'
        obs_file = 'plot_pixel_observed.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
            compare_list.append(r)

        # Do all the comparisons
        for c in compare_list:

            # Open the observed file
            obs_data = utilities.csv2rec(c.obs_file)

            # Open the predicted file
            prd_data = utilities.csv2rec(c.prd_file)

            # Ensure that the IDs between the observed and predicted
            # data line up
            ids1 = getattr(obs_data, c.id_field)
            ids2 = getattr(prd_data, c.id_field)
            if np.any(ids1 != ids2):
                err_msg = 'IDs do not match between observed and '
                err_msg += 'predicted data'
                raise ValueError(err_msg)

            for attr in attrs:
                arr1 = getattr(obs_data, attr)
                arr2 = getattr(prd_data, attr)
                rv = RiemannVariable(arr1, arr2)

                gmfr_stats = rv.gmfr_statistics()
                for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                                                        attr, stat.upper(),
                                                        gmfr_stats[stat])
                    stats_fh.write(stat_line)

                ks_stats = rv.ks_statistics()
                for stat in ('ks_max', 'ks_mean'):
                    stat_line = '%s,%d,%s,%s,%.4f\n' % (c.prefix.upper(), c.k,
                                                        attr, stat.upper(),
                                                        ks_stats[stat])
                    stats_fh.write(stat_line)
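The stat_sets built in this example are (field, function, output name) triples in the format expected by matplotlib.mlab.rec_groupby, which write_hex_stats presumably applies per hexagon ID (its body is not shown here). A minimal sketch of that aggregation pattern on toy data:

import numpy as np
import matplotlib.mlab as mlab  # rec_groupby exists only in older matplotlib

# toy plot-level data: three plots in hex 1, one plot in hex 2
recs = np.rec.fromrecords(
    [(1, 10.0), (1, 20.0), (1, 30.0), (2, 5.0)], names='HEX_ID,BIOMASS')

stats = (('BIOMASS', len, 'PLOT_COUNT'),
         ('BIOMASS', np.mean, 'BIOMASS'))
by_hex = mlab.rec_groupby(recs, ('HEX_ID',), stats)
# one row per HEX_ID, holding the plot count and the mean of BIOMASS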