def run_diagnostic(self):
    """Compute vegetation class accuracy for observed vs. predicted plots.

    Writes a per-plot VEGCLASS comparison file, then produces kappa and
    error-matrix statistics from it.
    """
    # Read the observed and predicted files into numpy recarrays
    obs = utilities.csv2rec(self.observed_file)
    prd = utilities.csv2rec(self.predicted_file)

    # Subset the observed data just to the IDs that are in the
    # predicted file
    obs_keep = np.in1d(
        getattr(obs, self.id_field), getattr(prd, self.id_field))
    obs = obs[obs_keep]

    # Calculate VEGCLASS for both the observed and predicted data
    vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

    # Print out the vegclass file; the context manager guarantees the
    # file is closed even if a record fails to format (the original
    # leaked the handle on exception)
    with open(self.vegclass_file, 'w') as vc_fh:
        vc_fh.write(
            ','.join((self.id_field, 'OBSERVED', 'PREDICTED')) + '\n')

        # Print out the observed and predicted vegetation classes
        for id_val in sorted(vc_dict.keys()):
            obs_vc = vc_dict[id_val]['obs_vc']
            prd_vc = vc_dict[id_val]['prd_vc']
            out_list = ['%d' % x for x in (id_val, obs_vc, prd_vc)]
            vc_fh.write(','.join(out_list) + '\n')

    # Create the vegetation class kappa and error matrix files
    # NOTE(review): hard-coded drive-letter path; consider making this
    # a configurable parameter
    vc_xml = 'L:/resources/code/xml/vegclass.xml'
    ca.classification_accuracy(
        self.vegclass_file, vc_xml,
        kappa_file=self.vegclass_kappa_file,
        err_matrix_file=self.vegclass_errmatrix_file)
def run_diagnostic(self):
    """Compute vegetation class accuracy for observed vs. predicted plots.

    Writes a per-plot VEGCLASS comparison file, then produces kappa and
    error-matrix statistics from it.
    """
    # Read the observed and predicted files into numpy recarrays
    obs = utilities.csv2rec(self.observed_file)
    prd = utilities.csv2rec(self.predicted_file)

    # Subset the observed data just to the IDs that are in the
    # predicted file
    obs_keep = np.in1d(
        getattr(obs, self.id_field), getattr(prd, self.id_field))
    obs = obs[obs_keep]

    # Calculate VEGCLASS for both the observed and predicted data
    vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

    # Write the vegclass file inside a context manager so the handle is
    # released even if formatting a record raises (originally the file
    # object was left open on error)
    with open(self.vegclass_file, 'w') as vc_fh:
        vc_fh.write(
            ','.join((self.id_field, 'OBSERVED', 'PREDICTED')) + '\n')

        # One row per plot: observed and predicted vegetation class
        for id_val in sorted(vc_dict.keys()):
            obs_vc = vc_dict[id_val]['obs_vc']
            prd_vc = vc_dict[id_val]['prd_vc']
            out_list = ['%d' % x for x in (id_val, obs_vc, prd_vc)]
            vc_fh.write(','.join(out_list) + '\n')

    # Create the vegetation class kappa and error matrix files
    # NOTE(review): hard-coded drive-letter path to the classifier XML
    vc_xml = 'L:/resources/code/xml/vegclass.xml'
    ca.classification_accuracy(
        self.vegclass_file, vc_xml,
        kappa_file=self.vegclass_kappa_file,
        err_matrix_file=self.vegclass_errmatrix_file)
def run_diagnostic(self):
    """Find vegetation class outliers for both independent and dependent
    predictions and write them to the outlier file.
    """
    # Open the outlier file and write the header line; the context
    # manager ensures the file is closed even on error
    with open(self.vegclass_outlier_file, "w") as vc_outlier_fh:
        header_fields = (
            self.id_field, "PREDICTION_TYPE", "OBSERVED_VEGCLASS",
            "PREDICTED_VEGCLASS", "OUTLIER_TYPE")
        vc_outlier_fh.write(",".join(header_fields) + "\n")

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:
            # Read the observed and predicted files into numpy recarrays
            obs = utilities.csv2rec(self.observed_file)
            prd = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(
                getattr(obs, self.id_field), getattr(prd, self.id_field))
            obs = obs[obs_keep]

            # Calculate VEGCLASS for both the observed and predicted data
            vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

            # Find the outliers
            outliers = self.find_vegclass_outliers(vc_dict)

            # Print out the outliers; renamed id -> id_val to avoid
            # shadowing the builtin
            for outlier in outliers:
                (id_val, obs_vc, prd_vc, outlier_type) = outlier
                out_fields = (
                    "%d" % id_val, "%s" % prd_type.upper(),
                    "%d" % obs_vc, "%d" % prd_vc,
                    "%s" % outlier_type)
                vc_outlier_fh.write(",".join(out_fields) + "\n")
def run_diagnostic(self):
    """Write plots whose absolute observed-vs-predicted deviation meets or
    exceeds each variable's threshold to the output file.
    """
    # Open the output file and write the header.
    # BUG FIX: each data row carries six fields (ending with the
    # difference) but the original header listed only five column names;
    # DIFFERENCE is added so the header matches the rows.
    with open(self.vd_output_file, 'w') as out_fh:
        out_fh.write(
            '%s,PREDICTION_TYPE,VARIABLE,OBSERVED_VALUE,PREDICTED_VALUE,'
            'DIFFERENCE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:
            # Read the observed and predicted files into numpy recarrays
            obs_data = utilities.csv2rec(self.observed_file)
            prd_data = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file.
            # NOTE(review): the deviation logic below pairs rows by
            # position, so obs_data and prd_data are assumed to be sorted
            # identically on the ID field -- confirm upstream ordering
            obs_keep = np.in1d(
                getattr(obs_data, self.id_field),
                getattr(prd_data, self.id_field))
            obs_data = obs_data[obs_keep]

            # Iterate over the list of deviation variables, capturing the
            # row indexes of plots that exceed the minimum threshold
            outliers = {}
            for (variable, min_deviation) in self.deviation_variables:
                obs_vals = getattr(obs_data, variable)
                prd_vals = getattr(prd_data, variable)
                abs_diff_vals = np.abs(obs_vals - prd_vals)
                outliers[variable] = np.argwhere(
                    abs_diff_vals >= min_deviation)

            # Write out the outlier records; renamed id -> id_val to
            # avoid shadowing the builtin
            for (variable, min_deviation) in self.deviation_variables:
                for index in outliers[variable]:
                    obs_row = obs_data[index]
                    prd_row = prd_data[index]
                    id_val = getattr(obs_row, self.id_field)
                    obs_val = getattr(obs_row, variable)
                    prd_val = getattr(prd_row, variable)
                    diff_val = obs_val - prd_val
                    out_data = [
                        '%d' % id_val,
                        '%s' % prd_type.upper(),
                        '%s' % variable,
                        '%.4f' % obs_val,
                        '%.4f' % prd_val,
                        '%.4f' % diff_val,
                    ]
                    out_fh.write(','.join(out_data) + '\n')
def _create_scatterplots(self):
    """Create an observed-vs-predicted scatterplot PNG for every
    continuous, project, accuracy, non-species attribute.

    Returns
    -------
    list of str
        The scatterplot file names that were created.
    """
    # Load the observed and predicted data
    obs_data = utilities.csv2rec(self.observed_file)
    prd_data = utilities.csv2rec(self.predicted_file)

    # Keep only observed records whose ID appears in the predicted set
    mask = np.in1d(
        getattr(obs_data, self.id_field), getattr(prd_data, self.id_field))
    obs_data = obs_data[mask]

    # Parse the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Continuous accuracy attributes flagged for the report, excluding
    # species variables
    attrs = [
        a.field_name for a in mp.attributes
        if a.field_type == "CONTINUOUS" and a.project_attr == 1
        and a.accuracy_attr == 1 and a.species_attr == 0
    ]

    # Build one scatterplot file per attribute
    scatter_files = []
    for attr in attrs:
        metadata = mp.get_attribute(attr)
        obs_vals = getattr(obs_data, attr)
        prd_vals = getattr(prd_data, attr)
        output_file = attr.lower() + "_scatter.png"
        mplf.draw_scatterplot(
            obs_vals, prd_vals, metadata,
            output_type=mplf.FILE, output_file=output_file)
        scatter_files.append(output_file)

    return scatter_files
def _create_scatterplots(self):
    """Draw one observed-vs-predicted scatterplot per reportable
    continuous accuracy attribute and return the file names created.
    """
    # Read both input tables into recarrays
    obs_data = utilities.csv2rec(self.observed_file)
    prd_data = utilities.csv2rec(self.predicted_file)

    # Restrict the observed records to IDs present in the predictions
    keep = np.in1d(
        getattr(obs_data, self.id_field), getattr(prd_data, self.id_field))
    obs_data = obs_data[keep]

    # Stand attribute metadata drives which attributes get plotted
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Continuous + project + accuracy attributes, non-species only
    attrs = [
        a.field_name for a in mp.attributes
        if a.field_type == 'CONTINUOUS' and a.project_attr == 1
        and a.accuracy_attr == 1 and a.species_attr == 0
    ]

    # One PNG per attribute
    scatter_files = []
    for attr in attrs:
        metadata = mp.get_attribute(attr)
        obs_vals = getattr(obs_data, attr)
        prd_vals = getattr(prd_data, attr)
        output_file = attr.lower() + '_scatter.png'
        mplf.draw_scatterplot(
            obs_vals, prd_vals, metadata,
            output_type=mplf.FILE, output_file=output_file)
        scatter_files.append(output_file)

    return scatter_files
def _get_attribute_data(self):
    """
    Retrieve the attribute data for which predictions will be made.
    Called once; the caller stores the result on the instance along with
    the attribute names (self.attrs).

    Returns
    -------
    stand_attr_data : numpy recarray
        Recarray with all stand attributes
    attrs : list of strs
        List of all continuous accuracy variables in stand_attr_data
    """
    parser = self.parameter_parser

    # Read the stand attribute table into a recarray
    stand_attr_data = utilities.csv2rec(parser.stand_attribute_file)

    # From the metadata, keep only continuous accuracy attributes
    mp = xsmp.XMLStandMetadataParser(parser.stand_metadata_file)
    attrs = []
    for attr in mp.attributes:
        if attr.field_type == 'CONTINUOUS' and attr.accuracy_attr == 1:
            attrs.append(attr.field_name)

    return (stand_attr_data, attrs)
def setUp(self):
    # Test fixture: input data, classifier definition, and reference
    # outputs used by the assertions in this test case
    self.data_fn = 'data/vegclass.csv'
    self.classifier_fn = 'data/vegclass.xml'
    # Reference (expected) error-matrix and kappa output files
    self.e_ref = 'data/vegclass_errmat.csv'
    self.k_ref = 'data/vegclass_kappa.csv'
    # Load the plot data and build the classifier under test
    self.data = utilities.csv2rec(self.data_fn)
    self.classifier = ca.Classifier.from_xml(self.classifier_fn)
def _get_attribute_data(self):
    """
    Retrieve the attribute data for which predictions will be made.
    Intended to be called once; the result is cached by the caller
    together with the attribute names (self.attrs).

    Returns
    -------
    stand_attr_data : numpy recarray
        Recarray with all stand attributes
    attrs : list of strs
        List of all continuous accuracy variables in stand_attr_data
    """
    parser = self.parameter_parser

    # Stand attribute table -> recarray
    stand_attr_data = utilities.csv2rec(parser.stand_attribute_file)

    # Filter metadata down to continuous accuracy attributes
    mp = xsmp.XMLStandMetadataParser(parser.stand_metadata_file)
    attrs = []
    for meta_attr in mp.attributes:
        if meta_attr.field_type == 'CONTINUOUS' \
                and meta_attr.accuracy_attr == 1:
            attrs.append(meta_attr.field_name)

    return (stand_attr_data, attrs)
def classification_accuracy(input_file, classifier_file, kappa_file=None,
                            err_matrix_file=None,
                            observed_column='OBSERVED',
                            predicted_column='PREDICTED'):
    """
    Read a plot-by-classification file of observed and predicted values
    together with a classifier XML file, and optionally write kappa
    statistics and an error matrix for the classified variable.

    Parameters
    ----------
    input_file : file
        Comma-separated file holding the observed and predicted
        classified values for all plots.  Must have a header line; the
        observed/predicted columns are named by 'observed_column' and
        'predicted_column'.
    classifier_file : file
        XML file describing the variable classification, including fuzzy
        set information.  Must validate against 'classifier.xsd'.
    kappa_file : file, optional
        Output file for kappa and fuzzy kappa statistics.  Defaults to
        None (not output).
    err_matrix_file : file, optional
        Output file for error matrix statistics.  Defaults to None
        (not output).
    observed_column : string
        Name of the observed column in input_file.  Defaults to
        'OBSERVED'.
    predicted_column : string
        Name of the predicted column in input_file.  Defaults to
        'PREDICTED'.

    Returns
    -------
    None
    """
    # Pull the observed and predicted series out of the raw input file
    records = utilities.csv2rec(input_file)
    obs_data = records[observed_column]
    prd_data = records[predicted_column]

    # Build the classifier from its XML description
    classifier = Classifier.from_xml(classifier_file)

    # Emit whichever outputs were requested
    if kappa_file is not None:
        print_kappa_file(obs_data, prd_data, classifier, kappa_file)
    if err_matrix_file is not None:
        print_error_matrix_file(
            obs_data, prd_data, classifier, err_matrix_file)
def run_diagnostic(self):
    """Generate dependent predictions for the validation plots, then run
    the local accuracy and vegetation class diagnostics on them.
    """
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # Read in the validation plots file
    validation_plots = utilities.csv2rec(self.observed_file)

    # Create a dictionary of plot ID to image year for these plots
    id_x_year = \
        dict((x[self.id_field], x.IMAGE_YEAR) for x in validation_plots)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=self.id_field)

    # Retrieve the predicted data for these plots.  In essence, we can
    # retrieve the dependent neighbors because these plot IDs are
    # guaranteed not to be in the model
    prediction_generator = pr.calculate_predictions_at_k(
        k=p.k, id_field=self.id_field, independent=False)

    # Write out the predictions; the context manager closes the file
    # even if a prediction record raises (the original leaked the
    # handle on error)
    with open(self.predicted_file, 'w') as out_fh:
        out_fh.write(self.id_field + ',' + ','.join(pr.attrs) + '\n')
        for plot_prediction in prediction_generator:
            pr.write_predicted_record(plot_prediction, out_fh)

    # Run the LocalAccuracyDiagnostic on these files
    d = lad.LocalAccuracyDiagnostic(
        observed_file=self.observed_file,
        independent_predicted_file=self.predicted_file,
        stand_metadata_file=self.stand_metadata_file,
        local_accuracy_file=self.local_accuracy_file,
        id_field=self.id_field)
    d.run_diagnostic()

    # Run the VegetationClassDiagnostic on these files
    d = vcd.VegetationClassDiagnostic(
        observed_file=self.observed_file,
        independent_predicted_file=self.predicted_file,
        vegclass_file=self.vegclass_file,
        vegclass_kappa_file=self.vegclass_kappa_file,
        vegclass_errmatrix_file=self.vegclass_errmatrix_file,
        id_field=self.id_field)
    d.run_diagnostic()
def run_diagnostic(self):
    """Create dependent predictions for the validation plots and feed
    them through the local accuracy and vegetation class diagnostics.
    """
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # Read in the validation plots file
    validation_plots = utilities.csv2rec(self.observed_file)

    # Create a dictionary of plot ID to image year for these plots
    id_x_year = \
        dict((x[self.id_field], x.IMAGE_YEAR) for x in validation_plots)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=self.id_field)

    # Retrieve the predicted data for these plots.  In essence, we can
    # retrieve the dependent neighbors because these plot IDs are
    # guaranteed not to be in the model
    prediction_generator = pr.calculate_predictions_at_k(
        k=p.k, id_field=self.id_field, independent=False)

    # Write out the predictions; use a context manager so the file is
    # closed even when a record raises (previously the handle leaked)
    with open(self.predicted_file, 'w') as out_fh:
        out_fh.write(self.id_field + ',' + ','.join(pr.attrs) + '\n')
        for plot_prediction in prediction_generator:
            pr.write_predicted_record(plot_prediction, out_fh)

    # Run the LocalAccuracyDiagnostic on these files
    d = lad.LocalAccuracyDiagnostic(
        observed_file=self.observed_file,
        independent_predicted_file=self.predicted_file,
        stand_metadata_file=self.stand_metadata_file,
        local_accuracy_file=self.local_accuracy_file,
        id_field=self.id_field)
    d.run_diagnostic()

    # Run the VegetationClassDiagnostic on these files
    d = vcd.VegetationClassDiagnostic(
        observed_file=self.observed_file,
        independent_predicted_file=self.predicted_file,
        vegclass_file=self.vegclass_file,
        vegclass_kappa_file=self.vegclass_kappa_file,
        vegclass_errmatrix_file=self.vegclass_errmatrix_file,
        id_field=self.id_field,
    )
    d.run_diagnostic()
def run_diagnostic(self):
    """Write NN-index records at or above the index threshold to the
    outlier file.
    """
    # Load the dependent nn_index file
    index_data = utilities.csv2rec(self.nn_index_file)

    # Keep only rows whose average position meets or exceeds the
    # configured threshold
    keep = index_data.AVERAGE_POSITION >= self.index_threshold

    # Persist the outlier subset
    utilities.rec2csv(index_data[keep], self.nn_index_outlier_file)
def list_points(self):
    """Return the (x, y) point tuples for a 'list' domain.

    Returns an empty list for any other domain kind.  Points come either
    from inline 'points' child elements or from a CSV file whose path is
    the child element's text.
    """
    # Only 'list' domains carry explicit points
    if self.domain != 'list':
        return []

    first_child = (self.domain_element.getchildren())[0]
    if first_child.tag == 'points':
        # Inline point elements: lower-case x/y members
        return [(pt.x, pt.y) for pt in first_child.getchildren()]

    # Otherwise the element text names a CSV file with X/Y columns
    records = utilities.csv2rec(str(first_child))
    return [(rec.X, rec.Y) for rec in records]
def load_outliers(self):
    """Read each configured outlier file and push its records to the DB."""
    p = self.parameter_parser
    for diag in p.outlier_diagnostics:
        # Instantiate the diagnostic and its matching formatter
        diagnostic = (self.diagnostic_type[diag])(p)
        formatter = (self.outlier_formatter[diag])(p)

        # Load the outlier records; csv2rec may return None (skip those)
        records = utilities.csv2rec(diagnostic.get_outlier_filename())
        if records is not None:
            formatter.load_outliers(records)
def create_predictions(self, no_self_assign_field='LOC_ID'): """ Creates model predictions and zonal pixel files from independent predictions, ie. plots are not able to use themselves (or other 'dependent' plots) as neighbors Parameters ---------- no_self_assign_field : str ID field at which no self assignment is allowed. Defaults to LOC_ID Returns ------- None """ # Aliases p = self.parameter_parser pr = self.prediction_run # Create a dictionary between id_field and no_self_assign_field env_file = p.environmental_matrix_file env_data = utilities.csv2rec(env_file) nsaf = no_self_assign_field nsa_id_dict = dict( (getattr(x, self.id_field), getattr(x, nsaf)) for x in env_data) # Open the prediction files zonal_pixel_file = p.independent_zonal_pixel_file predicted_file = p.independent_predicted_file zonal_pixel_fh, predicted_fh = \ self.open_prediction_files(zonal_pixel_file, predicted_file) # Create a generator for each ID in pr.neighbor_data prediction_generator = pr.calculate_predictions_at_k( k=p.k, id_field=self.id_field, independent=True, nsa_id_dict=nsa_id_dict) # Iterate over each prediction writing them out to the zonal pixel # and predicted attribute files for plot_prediction in prediction_generator: # Write this record to the zonal pixel file pr.write_zonal_pixel_record(plot_prediction, zonal_pixel_fh) # Write this record to the predicted attribute file pr.write_predicted_record(plot_prediction, predicted_fh) # Close files zonal_pixel_fh.close() predicted_fh.close()
def run_diagnostic(self):
    """Detect vegetation class outliers for both independent and
    dependent predictions and record them in the outlier file.
    """
    # Open the outlier file and write the header line inside a context
    # manager so the handle is released on error
    with open(self.vegclass_outlier_file, 'w') as vc_outlier_fh:
        header_fields = (
            self.id_field, 'PREDICTION_TYPE', 'OBSERVED_VEGCLASS',
            'PREDICTED_VEGCLASS', 'OUTLIER_TYPE')
        vc_outlier_fh.write(','.join(header_fields) + '\n')

        # Run this for both independent and dependent predictions
        for (prd_type, prd_file) in self.predicted_files:
            # Read the observed and predicted files into numpy recarrays
            obs = utilities.csv2rec(self.observed_file)
            prd = utilities.csv2rec(prd_file)

            # Subset the observed data just to the IDs that are in the
            # predicted file
            obs_keep = np.in1d(
                getattr(obs, self.id_field), getattr(prd, self.id_field))
            obs = obs[obs_keep]

            # Calculate VEGCLASS for both the observed and predicted data
            vc_dict = self.vegclass_aa(obs, prd, id_field=self.id_field)

            # Find the outliers
            outliers = self.find_vegclass_outliers(vc_dict)

            # Print out the outliers; renamed id -> id_val to avoid
            # shadowing the builtin
            for outlier in outliers:
                (id_val, obs_vc, prd_vc, outlier_type) = outlier
                out_fields = (
                    '%d' % id_val,
                    '%s' % prd_type.upper(),
                    '%d' % obs_vc,
                    '%d' % prd_vc,
                    '%s' % outlier_type,
                )
                vc_outlier_fh.write(','.join(out_fields) + '\n')
def _create_histograms(self):
    """Create an observed/predicted area histogram PNG for each
    reportable accuracy attribute and return the list of files created.
    """
    # Open the area estimate file into a recarray
    ae_data = utilities.csv2rec(self.regional_accuracy_file)

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Subset the attributes to those that are accuracy attributes,
    # are identified to go into the report, and are not species variables
    attrs = [
        x.field_name for x in mp.attributes
        if x.accuracy_attr == 1 and x.project_attr == 1
        and x.species_attr == 0
    ]

    # Iterate over the attributes and create a histogram file of each
    histogram_files = []
    for attr in attrs:
        # Metadata for this attribute
        metadata = mp.get_attribute(attr)

        # Get the observed and predicted data for this attribute
        obs_vals = self._get_histogram_data(ae_data, attr, 'OBSERVED')
        prd_vals = self._get_histogram_data(ae_data, attr, 'PREDICTED')

        # Set the areas for the observed and predicted data
        obs_area = obs_vals.AREA
        prd_area = prd_vals.AREA

        # Bin names must match between the observed and predicted series.
        # BUG FIX: the original tested np.all(bin_names != ...), which
        # only raised when *every* name differed; raise when *any* name
        # differs instead.
        bin_names = obs_vals.BIN_NAME
        if not np.all(bin_names == prd_vals.BIN_NAME):
            err_msg = 'Bin names are not the same for ' + attr
            raise ValueError(err_msg)

        # Create the output file name
        output_file = attr.lower() + '_histogram.png'

        # Create the histogram
        mplf.draw_histogram(
            [obs_area, prd_area], bin_names, metadata,
            output_type=mplf.FILE, output_file=output_file)

        # Add this to the list of histogram files
        histogram_files.append(output_file)

    # Return the list of histograms just created
    return histogram_files
def create_predictions(self, no_self_assign_field='LOC_ID'): """ Creates model predictions and zonal pixel files from independent predictions, ie. plots are not able to use themselves (or other 'dependent' plots) as neighbors Parameters ---------- no_self_assign_field : str ID field at which no self assignment is allowed. Defaults to LOC_ID Returns ------- None """ # Aliases p = self.parameter_parser pr = self.prediction_run # Create a dictionary between id_field and no_self_assign_field env_file = p.environmental_matrix_file env_data = utilities.csv2rec(env_file) nsaf = no_self_assign_field nsa_id_dict = dict((getattr(x, self.id_field), getattr(x, nsaf)) for x in env_data) # Open the prediction files zonal_pixel_file = p.independent_zonal_pixel_file predicted_file = p.independent_predicted_file zonal_pixel_fh, predicted_fh = \ self.open_prediction_files(zonal_pixel_file, predicted_file) # Create a generator for each ID in pr.neighbor_data prediction_generator = pr.calculate_predictions_at_k( k=p.k, id_field=self.id_field, independent=True, nsa_id_dict=nsa_id_dict) # Iterate over each prediction writing them out to the zonal pixel # and predicted attribute files for plot_prediction in prediction_generator: # Write this record to the zonal pixel file pr.write_zonal_pixel_record(plot_prediction, zonal_pixel_fh) # Write this record to the predicted attribute file pr.write_predicted_record(plot_prediction, predicted_fh) # Close files zonal_pixel_fh.close() predicted_fh.close()
def run_diagnostic(self):
    """Flag plots whose neighbor sets show excessive vegetation class
    variety, writing one row per flagged plot and prediction type.
    """
    # Open the stand attribute file and subset to just positive IDs
    attr_data = utilities.csv2rec(self.stand_attr_file)
    attr_data = attr_data[getattr(attr_data, self.id_field) > 0]

    # Create a simple dictionary of ID to vegetation class from the
    # attr_data
    vc_dict = dict(
        (getattr(x, self.id_field), x.VEGCLASS) for x in attr_data)

    # The ID list is the same for every prediction type; compute once
    # (hoisted out of the loop below)
    ids = getattr(attr_data, self.id_field)

    # Open the output file and write the header; the context manager
    # closes the file even on error
    with open(self.output_file, 'w') as out_fh:
        out_fh.write('%s,PREDICTION_TYPE\n' % self.id_field)

        # Run this for both independent and dependent predictions
        for (prd_type, zp_file) in self.zonal_pixel_files:
            # Open the zonal pixel file; hoist the ID column lookup out
            # of the per-plot loop
            zonal_data = utilities.csv2rec(zp_file)
            zonal_ids = getattr(zonal_data, self.id_field)

            # For each ID in zonal_data, retrieve the vegetation class
            # of its neighbors.  Renamed id -> id_val to avoid shadowing
            # the builtin.
            for id_val in ids:
                zonal_records = zonal_data[zonal_ids == id_val]
                vc_records = [vc_dict[x] for x in zonal_records.NEIGHBOR_ID]

                # Apply the logic for the variety
                if self.calculate_vc_variety(vc_records):
                    out_fh.write('%d,%s\n' % (id_val, prd_type.upper()))
def calculate_neighbors_cross_validation(self): """ Wrapper around get_predicted_neighbors_at_ids optimized for cross- validation (ie. using plots that went into model development). Parameters ---------- None Returns ------- None """ # Alias for self.parameter_parser p = self.parameter_parser # ID field id_field = p.summary_level + 'ID' # Get the environmental matrix file and read the plot IDs # and image years into a dictionary env_file = p.environmental_matrix_file env_data = utilities.csv2rec(env_file) # Associate each plot with a model year; this is either the year the # model is associated with (for models that use imagery), or the # model_year (for models that don't use imagery) if p.model_type in p.imagery_model_types: id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in env_data) else: id_x_year = dict((x[id_field], p.model_year) for x in env_data) # Subset the plot ID list down to just those plots that went into # imputation. This may be a subset of the plots that are in the # environmental matrix file based on running GNN in a unique way. # This requires parsing the model and extracting just the plot IDs ord_file = p.get_ordination_file() lop = lemma_ordination_parser.LemmaOrdinationParser() ord_model = lop.parse(ord_file, delimiter=',') plot_ids = ord_model.plot_ids id_x_year = dict( (i, id_x_year[i]) for i in id_x_year.keys() if i in plot_ids) # Call the main function self.calculate_neighbors_at_ids(id_x_year, id_field=id_field)
def get_observed_estimates(self):
    """Read the area estimate file and split out the sentinel records.

    Returns a tuple of (recarray of positive-ID records, nonforest
    hectares, nonsampled hectares).
    """
    # Read the area estimate file into a recarray
    obs_data = utilities.csv2rec(self.area_estimate_file)
    id_vals = getattr(obs_data, self.id_field)

    # Sentinel IDs: -10001 holds the nonforest hectares, -10002 the
    # nonsampled hectares
    nf_hectares = obs_data[id_vals == -10001][0].HECTARES
    ns_hectares = obs_data[id_vals == -10002][0].HECTARES

    # Drop the sentinel rows, keeping only real (positive-ID) records
    obs_data = obs_data[id_vals > 0]

    return obs_data, nf_hectares, ns_hectares
def get_predicted_estimates(self):
    """Derive predicted area estimates from the predicted raster.

    Reads the raster attribute table, converts pixel counts to hectares,
    joins positive-ID records with the stand attribute file, and returns
    (joined recarray, nonforest hectares).
    """
    # Read in the predicted raster and its band-1 attribute table
    ds = gdal.Open(self.predicted_raster, gdalconst.GA_ReadOnly)
    rat = ds.GetRasterBand(1).GetDefaultRAT()

    # Get the cell area for converting from pixel counts to hectares.
    # NOTE(review): uses gt[1] for both dimensions, i.e. assumes square
    # pixels -- confirm for the input rasters
    gt = ds.GetGeoTransform()
    cell_area = gt[1] * gt[1]

    # Get the IDs and counts (converted to hectares); RAT column 0 is
    # read as the ID, column 1 as a pixel count.  Nonpositive IDs are
    # pooled into the nonforest total.
    id_recs = []
    nf_hectares = 0
    for i in range(rat.GetRowCount()):
        id = rat.GetValueAsInt(i, 0)
        hectares = rat.GetValueAsInt(i, 1) * cell_area / 10000.0
        if id <= 0:
            nf_hectares += hectares
        else:
            id_recs.append((id, hectares))

    # Release the dataset (GDAL closes on dereference)
    ds = None

    # Convert this to a recarray
    names = (self.id_field, 'HECTARES')
    ids = np.rec.fromrecords(id_recs, names=names)

    # Read in the attribute file
    sad = utilities.csv2rec(self.stand_attribute_file)

    # Ensure that all IDs present in the raster are in the attribute data
    ids_1 = getattr(ids, self.id_field)
    ids_2 = getattr(sad, self.id_field)
    if not np.all(np.in1d(ids_1, ids_2)):
        err_msg = 'Not all values in the raster are present in the '
        err_msg += 'attribute data'
        raise ValueError(err_msg)

    # Join the two recarrays together
    predicted_data = mlab.rec_join(self.id_field, ids, sad)
    return (predicted_data, nf_hectares)
def _create_story(self):
    """Build the reportlab story (flowables) for the species accuracy
    section of the local-scale accuracy assessment report.

    Returns the list of flowables to be appended to the document.
    """
    # Set up an empty list to hold the story
    story = []

    # Import the report styles
    styles = report_styles.get_report_styles()

    # Create a page break
    story = self._make_page_break(story, self.PORTRAIT)

    # Section title
    title_str = '<strong>Local-Scale Accuracy Assessment:<br/>'
    title_str += 'Species Accuracy at Plot Locations'
    title_str += '</strong>'

    para = p.Paragraph(title_str, styles['section_style'])
    t = p.Table([[para]], colWidths=[7.5 * u.inch])
    t.setStyle(
        p.TableStyle([
            ('TOPPADDING', (0, 0), (-1, -1), 6),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ('BACKGROUND', (0, 0), (-1, -1), '#957348'),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ]))
    story.append(t)
    story.append(p.Spacer(0, 0.2 * u.inch))

    # Kappa explanation
    kappa_str = '''
        Cohen's kappa coefficient (Cohen, 1960) is a statistical measure
        of reliability, accounting for agreement occurring by chance. The
        equation for kappa is:
    '''
    para = p.Paragraph(kappa_str, styles['body_style'])
    story.append(para)
    story.append(p.Spacer(0, 0.05 * u.inch))

    kappa_str = '''
        kappa = (Pr(a) - Pr(e)) / (1.0 - Pr(e))
    '''
    para = p.Paragraph(kappa_str, styles['indented'])
    story.append(para)
    story.append(p.Spacer(0, 0.05 * u.inch))

    kappa_str = '''
        where Pr(a) is the relative observed agreement among raters, and
        Pr(e) is the probability that agreement is due to
        chance.<br/><br/>
        <strong>Abbreviations Used:</strong><br/>
        OP/PP = Observed Present / Predicted Present<br/>
        OA/PP = Observed Absent / Predicted Present
        (errors of commission)<br/>
        OP/PA = Observed Present / Predicted Absent
        (errors of ommission)<br/>
        OA/PA = Observed Absent / Predicted Absent
    '''
    para = p.Paragraph(kappa_str, styles['body_style'])
    story.append(para)
    story.append(p.Spacer(0, 0.2 * u.inch))

    # Create a list of lists to hold the species accuracy information
    species_table = []

    # Header row
    header_row = []

    spp_str = '<strong>Species PLANTS Code<br/>'
    spp_str += 'Scientific Name / Common Name</strong>'
    para = p.Paragraph(spp_str, styles['body_style_10'])
    header_row.append(para)

    spp_str = '<strong>Species prevalence</strong>'
    para = p.Paragraph(spp_str, styles['body_style_10'])
    header_row.append(para)

    # The four confusion cells are laid out as an inner 2x2 table
    p1 = p.Paragraph('<strong>OP/PP</strong>', styles['body_style_10_right'])
    p2 = p.Paragraph('<strong>OP/PA</strong>', styles['body_style_10_right'])
    p3 = p.Paragraph('<strong>OA/PP</strong>', styles['body_style_10_right'])
    p4 = p.Paragraph('<strong>OA/PA</strong>', styles['body_style_10_right'])
    header_cells = [[p1, p2], [p3, p4]]
    t = p.Table(header_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
    t.setStyle(
        p.TableStyle([
            ('GRID', (0, 0), (-1, -1), 1, colors.white),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ]))
    header_row.append(t)

    kappa_str = '<strong>Kappa coefficient</strong>'
    para = p.Paragraph(kappa_str, styles['body_style_10'])
    header_row.append(para)
    species_table.append(header_row)

    # Open the species accuracy file into a recarray
    spp_data = utilities.csv2rec(self.species_accuracy_file)

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Read in the report metadata if it exists
    if self.report_metadata_file:
        rmp = xrmp.XMLReportMetadataParser(self.report_metadata_file)
    else:
        rmp = None

    # Subset the attributes to just species
    attrs = []
    for attr in mp.attributes:
        if attr.species_attr == 1 and 'NOTALY' not in attr.field_name:
            attrs.append(attr.field_name)

    # Iterate over the species and print out the statistics
    for spp in attrs:
        # Empty row to hold the formatted output
        species_row = []

        # Get the scientific and common names from the report metadata
        # if it exists; otherwise, just use the species symbol
        if rmp is not None:
            # Strip off any suffix if it exists.
            # NOTE(review): spp.split('_')[0] cannot raise IndexError;
            # presumably rmp.get_species raises it for unknown codes --
            # confirm
            try:
                spp_plain = spp.split('_')[0]
                spp_info = rmp.get_species(spp_plain)
                spp_str = spp_info.spp_symbol + '<br/>'
                spp_str += spp_info.scientific_name + ' / '
                spp_str += spp_info.common_name
            except IndexError:
                spp_str = spp
        else:
            spp_str = spp
        para = p.Paragraph(spp_str, styles['body_style_10'])
        species_row.append(para)

        # Get the statistical information
        data = spp_data[spp_data.SPECIES == spp][0]
        counts = [data.OP_PP, data.OP_PA, data.OA_PP, data.OA_PA]
        prevalence = data.PREVALENCE
        kappa = data.KAPPA

        # Species prevalence
        prevalence_str = '%.4f' % prevalence
        para = p.Paragraph(prevalence_str, styles['body_style_10_right'])
        species_row.append(para)

        # Capture the plot counts in an inner table (2x2, row-major)
        count_cells = []
        count_row = []
        for i in range(0, 4):
            para = p.Paragraph(
                '%d' % counts[i], styles['body_style_10_right'])
            count_row.append(para)
            if i % 2 == 1:
                count_cells.append(count_row)
                count_row = []
        t = p.Table(count_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('GRID', (0, 0), (-1, -1), 1, colors.white),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
        species_row.append(t)

        # Print out the kappa statistic
        kappa_str = '%.4f' % kappa
        para = p.Paragraph(kappa_str, styles['body_style_10_right'])
        species_row.append(para)

        # Push this row to the master species table
        species_table.append(species_row)

    # Style this into a reportlab table and add to the story
    col_widths = [(x * u.inch) for x in [4.0, 0.75, 1.5, 0.75]]
    t = p.Table(species_table, colWidths=col_widths)
    t.setStyle(
        p.TableStyle([
            ('BACKGROUND', (0, 0), (-1, -1), '#f1efe4'),
            ('GRID', (0, 0), (-1, -1), 2, colors.white),
            ('TOPPADDING', (0, 0), (0, -1), 2),
            ('BOTTOMPADDING', (0, 0), (0, -1), 2),
            ('LEFTPADDING', (0, 0), (0, -1), 6),
            ('RIGHTPADDING', (0, 0), (0, -1), 6),
            ('ALIGNMENT', (0, 0), (0, -1), 'LEFT'),
            ('VALIGN', (0, 0), (0, -1), 'TOP'),
            ('TOPPADDING', (1, 0), (1, -1), 2),
            ('BOTTOMPADDING', (1, 0), (1, -1), 2),
            ('LEFTPADDING', (1, 0), (1, -1), 6),
            ('RIGHTPADDING', (1, 0), (1, -1), 6),
            ('ALIGNMENT', (1, 0), (1, -1), 'RIGHT'),
            ('VALIGN', (1, 0), (1, 0), 'TOP'),
            ('VALIGN', (1, 1), (1, -1), 'MIDDLE'),
            ('TOPPADDING', (2, 0), (2, -1), 0),
            ('BOTTOMPADDING', (2, 0), (2, -1), 0),
            ('LEFTPADDING', (2, 0), (2, -1), 0),
            ('RIGHTPADDING', (2, 0), (2, -1), 0),
            ('ALIGNMENT', (2, 0), (2, -1), 'LEFT'),
            ('VALIGN', (2, 0), (2, -1), 'TOP'),
            ('TOPPADDING', (3, 0), (3, -1), 2),
            ('BOTTOMPADDING', (3, 0), (3, -1), 2),
            ('LEFTPADDING', (3, 0), (3, -1), 6),
            ('RIGHTPADDING', (3, 0), (3, -1), 6),
            ('ALIGNMENT', (3, 0), (3, -1), 'RIGHT'),
            ('VALIGN', (3, 0), (3, 0), 'TOP'),
            ('VALIGN', (3, 1), (3, -1), 'MIDDLE'),
        ]))
    story.append(t)
    story.append(p.Spacer(0, 0.1 * u.inch))

    rare_species_str = """
        Note that some very rare species do not appear in this accuracy
        report, because these species were not included when building
        the initial ordination model.  The full set of species is
        available upon request.
    """
    para = p.Paragraph(rare_species_str, styles['body_style'])
    story.append(para)

    # Return this story
    return story
def run_diagnostic(self):
    """
    Riemann accuracy assessment: run the model at the plot/pixel scale,
    aggregate observed and predicted attributes to each hexagon
    resolution, and write GMFR/AC and KS comparison statistics to the
    hex statistics file.
    """
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # ID field
    id_field = p.summary_level + 'ID'

    # Root directory for Riemann files
    root_dir = p.riemann_output_folder

    # Read in hex input file
    obs_data = utilities.csv2rec(self.hex_attribute_file)

    # Get the hexagon levels and ensure that the fields exist in the
    # hex_attribute file
    hex_resolutions = p.riemann_hex_resolutions
    hex_fields = [x[0] for x in hex_resolutions]
    for field in hex_fields:
        if field not in obs_data.dtype.names:
            err_msg = 'Field ' + field + ' does not exist in the '
            err_msg += 'hex_attribute file'
            raise ValueError(err_msg)

    # Create the directory structure based on the hex levels
    hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
    all_levels = ['plot_pixel'] + hex_levels
    for level in all_levels:
        sub_dir = os.path.join(root_dir, level)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

    # Get the values of k
    k_values = p.riemann_k_values

    # Create a dictionary of plot ID to image year (or model_year for
    # non-imagery models) for these plots
    if p.model_type in p.imagery_model_types:
        id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
    else:
        id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

    # Create the lookup of id_field to LOC_ID for the hex plots
    nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

    # Create a dictionary between id_field and no_self_assign_field
    # for the model plots
    env_file = p.environmental_matrix_file
    env_data = utilities.csv2rec(env_file)
    model_nsa_id_dict = dict(
        (getattr(x, id_field), x.LOC_ID) for x in env_data)

    # Stitch the two dictionaries together; hex plots take precedence
    for id in sorted(model_nsa_id_dict.keys()):
        if id not in nsa_id_dict:
            nsa_id_dict[id] = model_nsa_id_dict[id]

    # Get the stand attribute metadata and retrieve only the
    # continuous accuracy attributes
    stand_metadata_file = p.stand_metadata_file
    mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
    attrs = [
        x.field_name for x in mp.attributes
        if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1
    ]

    # Subset the attributes for fields that are in the
    # hex_attribute file
    attrs = [x for x in attrs if x in obs_data.dtype.names]
    plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

    # Write out the plot_pixel observed file
    file_name = 'plot_pixel_observed.csv'
    output_file = os.path.join(root_dir, 'plot_pixel', file_name)
    utilities.rec2csv(plot_pixel_obs, output_file)

    # Iterate over values of k
    for k in k_values:

        # Construct the output file name
        file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
        file_name += '.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        out_fh = open(output_file, 'w')

        # For the plot/pixel scale, retrieve the independent predicted
        # data for this value of k. Even though attributes are being
        # returned from this function, we want to use the attribute list
        # that we've already found above.
        prediction_generator = pr.calculate_predictions_at_k(
            k=k, id_field=id_field, independent=True,
            nsa_id_dict=nsa_id_dict)

        # Write out the field names
        out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

        # Write out the predictions for this k
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

        # Close this file
        out_fh.close()

    # Create the fields for which to extract statistics at the hexagon
    # levels: a plot count plus per-attribute mean (and std dev)
    mean_fields = [(id_field, len, 'PLOT_COUNT')]
    mean_fields.extend([(x, np.mean, x) for x in attrs])
    mean_fields = tuple(mean_fields)

    sd_fields = [(id_field, len, 'PLOT_COUNT')]
    sd_fields.extend([(x, np.std, x) for x in attrs])
    sd_fields = tuple(sd_fields)

    stat_sets = {
        'mean': mean_fields,
        'std': sd_fields,
    }

    # For each hexagon level, associate the plots with their hexagon ID
    # and find observed and predicted statistics for each hexagon
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        min_plots_per_hex = hex_resolution[3]
        prefix = 'hex_' + str(hex_distance)

        # Create a crosswalk between the id_field and the hex_id_field
        id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

        # Iterate over all sets of statistics and write a unique file
        # for each set
        for (stat_name, stat_fields) in stat_sets.iteritems():

            # Get the output file name
            obs_out_file = \
                '_'.join((prefix, 'observed', stat_name)) + '.csv'
            obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

            # Write out the observed file
            self.write_hex_stats(
                obs_data, hex_id_field, stat_fields, min_plots_per_hex,
                obs_out_file)

        # Iterate over values of k for the predicted values
        for k in k_values:

            # Open the plot_pixel predicted file for this value of k
            # and join the hex_id_field to the recarray
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
            prd_data = utilities.csv2rec(prd_file)
            prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                prd_out_file = '_'.join(
                    (prefix, 'predicted', 'k' + str(k), stat_name)) + '.csv'
                prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                # Write out the predicted file
                self.write_hex_stats(
                    prd_data, hex_id_field, stat_fields, min_plots_per_hex,
                    prd_out_file)

    # Calculate the ECDF and AC statistics
    # For ECDF and AC, it is a paired comparison between the observed
    # and predicted data. We do this at each value of k and for each
    # hex resolution level.

    # Open the stats file
    stats_file = p.hex_statistics_file
    stats_fh = open(stats_file, 'w')
    header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
    stats_fh.write(','.join(header_fields) + '\n')

    # Create a list of RiemannComparison instances which store the
    # information needed to do comparisons between observed and predicted
    # files for any level or value of k
    compare_list = []
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        prefix = 'hex_' + str(hex_distance)
        obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = '_'.join(
                (prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(
                prefix, obs_file, prd_file, hex_id_field, k)
            compare_list.append(r)

    # Add the plot_pixel comparisons to this list
    prefix = 'plot_pixel'
    obs_file = 'plot_pixel_observed.csv'
    obs_file = os.path.join(root_dir, prefix, obs_file)
    for k in k_values:
        prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
        prd_file = os.path.join(root_dir, prefix, prd_file)
        r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
        compare_list.append(r)

    # Do all the comparisons
    for c in compare_list:

        # Open the observed file
        obs_data = utilities.csv2rec(c.obs_file)

        # Open the predicted file
        prd_data = utilities.csv2rec(c.prd_file)

        # Ensure that the IDs between the observed and predicted
        # data line up.
        # BUG FIX: the original tested np.all(ids1 != ids2), which only
        # raises when EVERY ID differs, so a partial misalignment was
        # silently accepted. Any single mismatch must be fatal.
        ids1 = getattr(obs_data, c.id_field)
        ids2 = getattr(prd_data, c.id_field)
        if np.any(ids1 != ids2):
            err_msg = 'IDs do not match between observed and '
            err_msg += 'predicted data'
            raise ValueError(err_msg)

        for attr in attrs:
            arr1 = getattr(obs_data, attr)
            arr2 = getattr(prd_data, attr)
            rv = RiemannVariable(arr1, arr2)

            # Geometric mean functional relationship (GMFR) statistics
            gmfr_stats = rv.gmfr_statistics()
            for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    gmfr_stats[stat])
                stats_fh.write(stat_line)

            # Kolmogorov-Smirnov (ECDF) statistics
            ks_stats = rv.ks_statistics()
            for stat in ('ks_max', 'ks_mean'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    ks_stats[stat])
                stats_fh.write(stat_line)

    # Close the stats file (the original never closed this handle)
    stats_fh.close()
def calculate_neighbors_at_ids(self, id_x_year, id_field='FCID'): """ Run ordination model over the list of IDs sent in and return neighbors and distances for each plot Parameters ---------- id_x_year : dict Dictionary of plot IDs to associated imagery year to know what year to run the model id_field : str Name of the ID field - should be either 'FCID' or 'PLTID'. Defaults to 'FCID' Returns ------- None (neighbor data stored as self attribute) """ # Alias for self.parameter_parser p = self.parameter_parser # Ensure the parameter parser is not a PROTOTYPE if p.parameter_set not in ('FULL', 'MINIMUM'): err_msg = 'Parameter set must be "FULL" or "MINIMUM"' raise ValueError(err_msg) # Get footprint file fp_file = p.footprint_file # Check ID field if id_field not in ('FCID', 'PLTID'): err_msg = id_field + ' accuracy assessment is not currently ' err_msg += 'supported' raise NotImplementedError(err_msg) # Get a list of the unique IDs ids = np.unique(id_x_year.keys()) # Get a list of the years over which we need to run models years = np.unique(id_x_year.values()) # Create a dictionary of all plots associated with each model year year_ids = {} for (k, v) in id_x_year.iteritems(): try: year_ids[v].append(k) except KeyError: year_ids[v] = [k] # This section extracts the ordination variable information from the # model XML files and creates a dict of year/variable combinations. # Once this dict is created, we only need to extract the spatial data # from the unique set of values in this dict and use this crosswalk # to get to those values. This should be efficient from GDAL's # perspective to avoid cache thrashing. # # However, because we don't need all ordination variable's values for # all plots (ie. temporally varying ordination variables), at this # point we only want to extract footprints for those variables that are # common across all years. 
We track the count of times a variable # appears across all lists (raster_counts) and if equal to # len(years), we extract footprints at this point. # # For all other variables, we wait until we have a subset of the coords # to extract the spatial data ord_year_var_dict = {} raster_counts = {} raster_dict = {} for year in years: ord_year_var_dict[year] = {} # Get the ordination variables specialized for this year ord_vars = p.get_ordination_variables(year) for (var, path) in ord_vars: # For this year, variable combination, store the path to the # variable ord_year_var_dict[year][var] = path # Record this variable in the counts and push to the raster # list if it's a new variable try: raster_counts[path] += 1 except KeyError: ds = gdal.Open(path, gdalconst.GA_ReadOnly) raster_dict[path] = [ds, False] raster_counts[path] = 1 # Retrieve all coordinates records as a recarray coords = utilities.csv2rec(p.coordinate_file) # Subset this list to just those plots in the model id_arr = getattr(coords, id_field) coord_list = coords[np.in1d(id_arr, ids)] # Retrieve the footprint configurations. Footprint offsets store the # row and column tuples of each pixel within a given footprint. 
# Footprint windows store the upper left coordinate and window size for # extraction from GDAL datasets fp_parser = footprint.FootprintParser() fp_dict = fp_parser.parse(fp_file) fp_offsets = {} fp_windows = {} for (id, data_source, x, y) in coord_list: fp_offsets[id] = fp_dict[data_source].offsets fp_windows[id] = fp_dict[data_source].window((x, y)) # Extract footprint information for every ordination variable that is # common to all years and store in a dict keyed by ID and raster # file name fp_value_dict = {} for (fn, count) in raster_counts.iteritems(): if count == len(years): print fn ds, processed = raster_dict[fn] # Get the footprint window values for this dataset fp_values = self.get_footprint_values(ds, fp_windows) # Change the flag for this dataset to 'processed' raster_dict[fn][1] = True # Store these footprint values in a dictionary keyed by # id and variable file name for (id, fp) in fp_values.iteritems(): try: fp_value_dict[id][fn] = fp except KeyError: fp_value_dict[id] = {} fp_value_dict[id][fn] = fp # Close this dataset - no longer needed raster_dict[fn][0] = None # Get the ordination model and read it in ord_file = p.get_ordination_file() lop = lemma_ordination_parser.LemmaOrdinationParser() ord_model = lop.parse(ord_file, delimiter=',') # Create the imputation model based on the ordination model and the # imputation parameters imp_model = im.ImputationModel(ord_model, n_axes=p.number_axes, use_weightings=p.use_axis_weighting, max_neighbors=p.max_neighbors) # Main loop to iterate over all years for year in years: print year # Get the subset of footprint offsets and windows for this year offsets = dict((x, fp_offsets[x]) for x in year_ids[year]) windows = dict((x, fp_windows[x]) for x in year_ids[year]) # Extract footprints for any variables that are not common to all # years, but specialized for this year for (var, fn) in ord_year_var_dict[year].iteritems(): ds, processed = raster_dict[fn] if not processed: print fn # Extract footprint values for 
this dataset fp_values = self.get_footprint_values(ds, windows) # Set the processed flag to True raster_dict[fn][1] = True # Store these values for (id, fp) in fp_values.iteritems(): try: fp_value_dict[id][fn] = fp except: fp_value_dict[id] = {} fp_value_dict[id][fn] = fp # Close the dataset - no longer needed raster_dict[fn][0] = None # At this point, we have all the footprint information needed for # this year stored in fp_value_dict. Now, iterate over each plot # in this year and run the imputation for each pixel. Output is # captured at the pixel scale (within zonal_pixel_dict) and # for each attribute at the plot scale (within predicted_dict). for id in sorted(windows.keys()): # Get the footprint values for this plot fp_values = [] for var in ord_model.var_names: fn = ord_year_var_dict[year][var] fp_values.append(fp_value_dict[id][fn]) # Set up an output instance to capture each pixel's neighbors # and distances obj = NNFootprint(id) # Run the imputation for each pixel in the footprint for o in offsets[id]: # Get the ordination variable values for this offset # Store in (1xv) array v = np.array(self.get_values_from_offset(fp_values, o)) v = v[np.newaxis, :] # Run the imputation nn_ids, nn_dists = imp_model.get_neighbors(v, id=id) # Append this pixel to the NNFootprint object obj.append(NNPixel(nn_ids, nn_dists)) # Store the neighbor information self.neighbor_data[id] = copy.deepcopy(obj)
def run(self):
    """
    Fit an RDA ordination from the species and environment matrices and
    write every model component (eigenvalues, variable means, coefficient
    loadings, biplot scores, species centroids/tolerances/information,
    site LC/WA scores and site information) to self.ord_file as a
    sectioned CSV report.

    Raises
    ------
    ValueError
        If the species and environment plot IDs do not match.
    """
    # Convert the species and environment matrices to numpy rec arrays
    spp_ra = utilities.csv2rec(self.spp_file)
    env_ra = utilities.csv2rec(self.env_file)

    # Extract the plot IDs from both the species and environment matrices
    # and ensure that they are equal
    spp_plot_ids = getattr(spp_ra, self.id_field)
    env_plot_ids = getattr(env_ra, self.id_field)
    if not np.all(spp_plot_ids == env_plot_ids):
        err_msg = 'Species and environment plot IDs do not match'
        raise ValueError(err_msg)

    # Drop the ID column from both arrays
    spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
    env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

    # For the environment matrix, only keep the variables specified
    env_ra = mlab.rec_keep_fields(env_ra, self.variables)

    # Convert these matrices to pure floating point arrays
    spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
    env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

    # Apply transformation if desired
    if self.species_transform == 'SQRT':
        spp = np.sqrt(spp)
    elif self.species_transform == 'LOG':
        spp = np.log(spp)

    # Create the RDA object (local name 'cca' is a holdover from a CCA
    # variant of this code)
    cca = numpy_ordination.NumpyRDA(spp, env)

    # Open the output file inside a context manager so the handle is
    # closed even if a write raises partway through (the original
    # open()/close() pair leaked the handle on error)
    with open(self.ord_file, 'w') as numpy_fh:

        # Eigenvalues
        numpy_fh.write('### Eigenvalues ###\n')
        for (i, e) in enumerate(cca.eigenvalues):
            numpy_fh.write('RDA' + str(i + 1) + ',' + '%.10f' % e + '\n')
        numpy_fh.write('\n')

        # Print out variable means
        numpy_fh.write('### Variable Means ###\n')
        for (i, m) in enumerate(cca.env_means):
            numpy_fh.write('%s,%.10f\n' % (self.variables[i], m))
        numpy_fh.write('\n')

        # Print out environmental coefficients loadings
        numpy_fh.write('### Coefficient Loadings ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('VARIABLE,' + header_str + '\n')
        for (i, c) in enumerate(cca.coefficients()):
            coeff = ','.join(['%.10f' % x for x in c])
            numpy_fh.write('%s,%s\n' % (self.variables[i], coeff))
        numpy_fh.write('\n')

        # Print out biplot scores
        numpy_fh.write('### Biplot Scores ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('VARIABLE,' + header_str + '\n')
        for (i, b) in enumerate(cca.biplot_scores()):
            scores = ','.join(['%.10f' % x for x in b])
            numpy_fh.write('%s,%s\n' % (self.variables[i], scores))
        numpy_fh.write('\n')

        # Print out species centroids
        numpy_fh.write('### Species Centroids ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('SPECIES,' + header_str + '\n')
        for (i, c) in enumerate(cca.species_centroids()):
            scores = ','.join(['%.10f' % x for x in c])
            numpy_fh.write('%s,%s\n' % (spp_ra.dtype.names[i], scores))
        numpy_fh.write('\n')

        # Print out species tolerances
        numpy_fh.write('### Species Tolerances ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('SPECIES,' + header_str + '\n')
        for (i, t) in enumerate(cca.species_tolerances()):
            # NOTE(review): '%.21f' is inconsistent with the '%.10f'
            # used by every other section - possibly a typo, but kept
            # as-is since downstream parsers may expect this width;
            # confirm intended precision
            scores = ','.join(['%.21f' % x for x in t])
            numpy_fh.write('%s,%s\n' % (spp_ra.dtype.names[i], scores))
        numpy_fh.write('\n')

        # Print out miscellaneous species information
        numpy_fh.write('### Miscellaneous Species Information ###\n')
        numpy_fh.write('SPECIES,WEIGHT,N2\n')
        species_weights, species_n2 = cca.species_information()
        for i in xrange(len(species_weights)):
            numpy_fh.write(
                '%s,%.10f,%.10f\n' % (
                    spp_ra.dtype.names[i], species_weights[i],
                    species_n2[i]))
        numpy_fh.write('\n')

        # Print out site LC scores
        numpy_fh.write('### Site LC Scores ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('ID,' + header_str + '\n')
        for (i, s) in enumerate(cca.site_lc_scores()):
            scores = ','.join(['%.10f' % x for x in s])
            numpy_fh.write('%d,%s\n' % (spp_plot_ids[i], scores))
        numpy_fh.write('\n')

        # Print out site WA scores
        numpy_fh.write('### Site WA Scores ###\n')
        header_str = ','.join(
            ['RDA%d' % (i + 1) for i in xrange(cca.rank)])
        numpy_fh.write('ID,' + header_str + '\n')
        for (i, s) in enumerate(cca.site_wa_scores()):
            scores = ','.join(['%.10f' % x for x in s])
            numpy_fh.write('%d,%s\n' % (spp_plot_ids[i], scores))
        numpy_fh.write('\n')

        # Miscellaneous site information
        numpy_fh.write('### Miscellaneous Site Information ###\n')
        numpy_fh.write('ID,WEIGHT,N2\n')
        site_weights, site_n2 = cca.site_information()
        for i in xrange(len(site_weights)):
            numpy_fh.write(
                '%s,%.10f,%.10f\n' % (
                    spp_plot_ids[i], site_weights[i], site_n2[i]))
def run_diagnostic(self): # Open the stats file and print out the header line stats_fh = open(self.statistics_file, 'w') out_list = [ 'VARIABLE', 'PEARSON_R', 'SPEARMAN_R', 'RMSE', 'NORMALIZED_RMSE', 'BIAS_PERCENTAGE', 'R_SQUARE', ] stats_fh.write(','.join(out_list) + '\n') # Read the observed and predicted files into numpy recarrays obs = utilities.csv2rec(self.observed_file) prd = utilities.csv2rec(self.predicted_file) # Subset the observed data just to the IDs that are in the # predicted file obs_keep = np.in1d( getattr(obs, self.id_field), getattr(prd, self.id_field)) obs = obs[obs_keep] # Read in the stand attribute metadata mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file) # For each variable, calculate the statistics for v in obs.dtype.names: # Get the metadata for this field try: fm = mp.get_attribute(v) except: err_msg = v + ' is missing metadata.' print err_msg continue # Only continue if this is a continuous accuracy variable if fm.field_type != 'CONTINUOUS' or fm.accuracy_attr == 0: continue obs_vals = getattr(obs, v) prd_vals = getattr(prd, v) if np.all(obs_vals == 0.0): pearson_r = 0.0 spearman_r = 0.0 rmse = 0.0 std_rmse = 0.0 bias = 0.0 r2 = 0.0 else: if np.all(prd_vals == 0.0): pearson_r = 0.0 spearman_r = 0.0 else: pearson_r = statistics.pearson_r(obs_vals, prd_vals) spearman_r = statistics.spearman_r(obs_vals, prd_vals) rmse = statistics.rmse(obs_vals, prd_vals) std_rmse = rmse / obs_vals.mean() bias = statistics.bias_percentage(obs_vals, prd_vals) r2 = statistics.r2(obs_vals, prd_vals) # Print this out to the stats file out_list = [ v, '%.6f' % pearson_r, '%.6f' % spearman_r, '%.6f' % rmse, '%.6f' % std_rmse, '%.6f' % bias, '%.6f' % r2, ] stats_fh.write(','.join(out_list) + '\n') stats_fh.close()
def run_diagnostic(self): # Open the stats file and print out the header line stats_fh = open(self.statistics_file, 'w') out_list = [ 'VARIABLE', 'PEARSON_R', 'SPEARMAN_R', 'RMSE', 'NORMALIZED_RMSE', 'BIAS_PERCENTAGE', 'R_SQUARE', ] stats_fh.write(','.join(out_list) + '\n') # Read the observed and predicted files into numpy recarrays obs = utilities.csv2rec(self.observed_file) prd = utilities.csv2rec(self.predicted_file) # Subset the observed data just to the IDs that are in the # predicted file obs_keep = np.in1d(getattr(obs, self.id_field), getattr(prd, self.id_field)) obs = obs[obs_keep] # Read in the stand attribute metadata mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file) # For each variable, calculate the statistics for v in obs.dtype.names: # Get the metadata for this field try: fm = mp.get_attribute(v) except: err_msg = v + ' is missing metadata.' print err_msg continue # Only continue if this is a continuous accuracy variable if fm.field_type != 'CONTINUOUS' or fm.accuracy_attr == 0: continue obs_vals = getattr(obs, v) prd_vals = getattr(prd, v) if np.all(obs_vals == 0.0): pearson_r = 0.0 spearman_r = 0.0 rmse = 0.0 std_rmse = 0.0 bias = 0.0 r2 = 0.0 else: if np.all(prd_vals == 0.0): pearson_r = 0.0 spearman_r = 0.0 else: pearson_r = statistics.pearson_r(obs_vals, prd_vals) spearman_r = statistics.spearman_r(obs_vals, prd_vals) rmse = statistics.rmse(obs_vals, prd_vals) std_rmse = rmse / obs_vals.mean() bias = statistics.bias_percentage(obs_vals, prd_vals) r2 = statistics.r2(obs_vals, prd_vals) # Print this out to the stats file out_list = [ v, '%.6f' % pearson_r, '%.6f' % spearman_r, '%.6f' % rmse, '%.6f' % std_rmse, '%.6f' % bias, '%.6f' % r2, ] stats_fh.write(','.join(out_list) + '\n') stats_fh.close()
def run_diagnostic(self):
    """
    Riemann accuracy assessment: run the model at the plot/pixel scale,
    aggregate observed and predicted attributes to each hexagon
    resolution, and write GMFR/AC and KS comparison statistics to the
    hex statistics file.
    """
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # ID field
    id_field = p.summary_level + 'ID'

    # Root directory for Riemann files
    root_dir = p.riemann_output_folder

    # Read in hex input file
    obs_data = utilities.csv2rec(self.hex_attribute_file)

    # Get the hexagon levels and ensure that the fields exist in the
    # hex_attribute file
    hex_resolutions = p.riemann_hex_resolutions
    hex_fields = [x[0] for x in hex_resolutions]
    for field in hex_fields:
        if field not in obs_data.dtype.names:
            err_msg = 'Field ' + field + ' does not exist in the '
            err_msg += 'hex_attribute file'
            raise ValueError(err_msg)

    # Create the directory structure based on the hex levels
    hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
    all_levels = ['plot_pixel'] + hex_levels
    for level in all_levels:
        sub_dir = os.path.join(root_dir, level)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

    # Get the values of k
    k_values = p.riemann_k_values

    # Create a dictionary of plot ID to image year (or model_year for
    # non-imagery models) for these plots
    if p.model_type in p.imagery_model_types:
        id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
    else:
        id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

    # Create the lookup of id_field to LOC_ID for the hex plots
    nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

    # Create a dictionary between id_field and no_self_assign_field
    # for the model plots
    env_file = p.environmental_matrix_file
    env_data = utilities.csv2rec(env_file)
    model_nsa_id_dict = dict(
        (getattr(x, id_field), x.LOC_ID) for x in env_data)

    # Stitch the two dictionaries together; hex plots take precedence
    for id in sorted(model_nsa_id_dict.keys()):
        if id not in nsa_id_dict:
            nsa_id_dict[id] = model_nsa_id_dict[id]

    # Get the stand attribute metadata and retrieve only the
    # continuous accuracy attributes
    stand_metadata_file = p.stand_metadata_file
    mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
    attrs = [
        x.field_name for x in mp.attributes
        if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1
    ]

    # Subset the attributes for fields that are in the
    # hex_attribute file
    attrs = [x for x in attrs if x in obs_data.dtype.names]
    plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

    # Write out the plot_pixel observed file
    file_name = 'plot_pixel_observed.csv'
    output_file = os.path.join(root_dir, 'plot_pixel', file_name)
    utilities.rec2csv(plot_pixel_obs, output_file)

    # Iterate over values of k
    for k in k_values:

        # Construct the output file name
        file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
        file_name += '.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        out_fh = open(output_file, 'w')

        # For the plot/pixel scale, retrieve the independent predicted
        # data for this value of k. Even though attributes are being
        # returned from this function, we want to use the attribute list
        # that we've already found above.
        prediction_generator = pr.calculate_predictions_at_k(
            k=k, id_field=id_field, independent=True,
            nsa_id_dict=nsa_id_dict)

        # Write out the field names
        out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

        # Write out the predictions for this k
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

        # Close this file
        out_fh.close()

    # Create the fields for which to extract statistics at the hexagon
    # levels: a plot count plus per-attribute mean (and std dev)
    mean_fields = [(id_field, len, 'PLOT_COUNT')]
    mean_fields.extend([(x, np.mean, x) for x in attrs])
    mean_fields = tuple(mean_fields)

    sd_fields = [(id_field, len, 'PLOT_COUNT')]
    sd_fields.extend([(x, np.std, x) for x in attrs])
    sd_fields = tuple(sd_fields)

    stat_sets = {
        'mean': mean_fields,
        'std': sd_fields,
    }

    # For each hexagon level, associate the plots with their hexagon ID
    # and find observed and predicted statistics for each hexagon
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        min_plots_per_hex = hex_resolution[3]
        prefix = 'hex_' + str(hex_distance)

        # Create a crosswalk between the id_field and the hex_id_field
        id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

        # Iterate over all sets of statistics and write a unique file
        # for each set
        for (stat_name, stat_fields) in stat_sets.iteritems():

            # Get the output file name
            obs_out_file = \
                '_'.join((prefix, 'observed', stat_name)) + '.csv'
            obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

            # Write out the observed file
            self.write_hex_stats(
                obs_data, hex_id_field, stat_fields, min_plots_per_hex,
                obs_out_file)

        # Iterate over values of k for the predicted values
        for k in k_values:

            # Open the plot_pixel predicted file for this value of k
            # and join the hex_id_field to the recarray
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
            prd_data = utilities.csv2rec(prd_file)
            prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                prd_out_file = '_'.join(
                    (prefix, 'predicted', 'k' + str(k), stat_name)) + '.csv'
                prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                # Write out the predicted file
                self.write_hex_stats(
                    prd_data, hex_id_field, stat_fields, min_plots_per_hex,
                    prd_out_file)

    # Calculate the ECDF and AC statistics
    # For ECDF and AC, it is a paired comparison between the observed
    # and predicted data. We do this at each value of k and for each
    # hex resolution level.

    # Open the stats file
    stats_file = p.hex_statistics_file
    stats_fh = open(stats_file, 'w')
    header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
    stats_fh.write(','.join(header_fields) + '\n')

    # Create a list of RiemannComparison instances which store the
    # information needed to do comparisons between observed and predicted
    # files for any level or value of k
    compare_list = []
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        prefix = 'hex_' + str(hex_distance)
        obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = '_'.join(
                (prefix, 'predicted', 'k' + str(k), 'mean')) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(
                prefix, obs_file, prd_file, hex_id_field, k)
            compare_list.append(r)

    # Add the plot_pixel comparisons to this list
    prefix = 'plot_pixel'
    obs_file = 'plot_pixel_observed.csv'
    obs_file = os.path.join(root_dir, prefix, obs_file)
    for k in k_values:
        prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
        prd_file = os.path.join(root_dir, prefix, prd_file)
        r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
        compare_list.append(r)

    # Do all the comparisons
    for c in compare_list:

        # Open the observed file
        obs_data = utilities.csv2rec(c.obs_file)

        # Open the predicted file
        prd_data = utilities.csv2rec(c.prd_file)

        # Ensure that the IDs between the observed and predicted
        # data line up.
        # BUG FIX: the original tested np.all(ids1 != ids2), which only
        # raises when EVERY ID differs, so a partial misalignment was
        # silently accepted. Any single mismatch must be fatal.
        ids1 = getattr(obs_data, c.id_field)
        ids2 = getattr(prd_data, c.id_field)
        if np.any(ids1 != ids2):
            err_msg = 'IDs do not match between observed and '
            err_msg += 'predicted data'
            raise ValueError(err_msg)

        for attr in attrs:
            arr1 = getattr(obs_data, attr)
            arr2 = getattr(prd_data, attr)
            rv = RiemannVariable(arr1, arr2)

            # Geometric mean functional relationship (GMFR) statistics
            gmfr_stats = rv.gmfr_statistics()
            for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    gmfr_stats[stat])
                stats_fh.write(stat_line)

            # Kolmogorov-Smirnov (ECDF) statistics
            ks_stats = rv.ks_statistics()
            for stat in ('ks_max', 'ks_mean'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    ks_stats[stat])
                stats_fh.write(stat_line)

    # Close the stats file (the original never closed this handle)
    stats_fh.close()
def _create_story(self):
    """
    Build the reportlab 'story' (list of flowables) for the local-scale
    species accuracy section of the report.

    Reads self.species_accuracy_file (per-species error-matrix counts,
    prevalence and kappa), self.stand_metadata_file (attribute metadata)
    and, if present, self.report_metadata_file (species names), then lays
    out a section title, a kappa explanation, and one table row per
    species attribute.

    Returns
    -------
    story : list
        List of reportlab flowables for this report section.
    """
    # Set up an empty list to hold the story
    story = []

    # Import the report styles
    styles = report_styles.get_report_styles()

    # Create a page break
    story = self._make_page_break(story, self.PORTRAIT)

    # Section title
    title_str = '<strong>Local-Scale Accuracy Assessment:<br/>'
    title_str += 'Species Accuracy at Plot Locations'
    title_str += '</strong>'

    para = p.Paragraph(title_str, styles['section_style'])
    t = p.Table([[para]], colWidths=[7.5 * u.inch])
    t.setStyle(
        p.TableStyle([
            ('TOPPADDING', (0, 0), (-1, -1), 6),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ('BACKGROUND', (0, 0), (-1, -1), '#957348'),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ]))
    story.append(t)
    story.append(p.Spacer(0, 0.2 * u.inch))

    # Kappa explanation
    kappa_str = '''
        Cohen's kappa coefficient (Cohen, 1960) is a statistical measure
        of reliability, accounting for agreement occurring by chance.
        The equation for kappa is:
    '''
    para = p.Paragraph(kappa_str, styles['body_style'])
    story.append(para)
    story.append(p.Spacer(0, 0.05 * u.inch))

    kappa_str = '''
        kappa = (Pr(a) - Pr(e)) / (1.0 - Pr(e))
    '''
    para = p.Paragraph(kappa_str, styles['indented'])
    story.append(para)
    story.append(p.Spacer(0, 0.05 * u.inch))

    kappa_str = '''
        where Pr(a) is the relative observed agreement among raters, and
        Pr(e) is the probability that agreement is due to
        chance.<br/><br/>
        <strong>Abbreviations Used:</strong><br/>
        OP/PP = Observed Present / Predicted Present<br/>
        OA/PP = Observed Absent / Predicted Present
        (errors of commission)<br/>
        OP/PA = Observed Present / Predicted Absent
        (errors of ommission)<br/>
        OA/PA = Observed Absent / Predicted Absent
    '''
    para = p.Paragraph(kappa_str, styles['body_style'])
    story.append(para)
    story.append(p.Spacer(0, 0.2 * u.inch))

    # Create a list of lists to hold the species accuracy information
    species_table = []

    # Header row
    header_row = []

    spp_str = '<strong>Species PLANTS Code<br/>'
    spp_str += 'Scientific Name / Common Name</strong>'
    para = p.Paragraph(spp_str, styles['body_style_10'])
    header_row.append(para)

    spp_str = '<strong>Species prevalence</strong>'
    para = p.Paragraph(spp_str, styles['body_style_10'])
    header_row.append(para)

    # The four error-matrix count headers live in a nested 2x2 table so
    # they line up with the per-species count cells below
    p1 = p.Paragraph('<strong>OP/PP</strong>', styles['body_style_10_right'])
    p2 = p.Paragraph('<strong>OP/PA</strong>', styles['body_style_10_right'])
    p3 = p.Paragraph('<strong>OA/PP</strong>', styles['body_style_10_right'])
    p4 = p.Paragraph('<strong>OA/PA</strong>', styles['body_style_10_right'])
    header_cells = [[p1, p2], [p3, p4]]
    t = p.Table(header_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
    t.setStyle(
        p.TableStyle([
            ('GRID', (0, 0), (-1, -1), 1, colors.white),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ]))
    header_row.append(t)

    kappa_str = '<strong>Kappa coefficient</strong>'
    para = p.Paragraph(kappa_str, styles['body_style_10'])
    header_row.append(para)
    species_table.append(header_row)

    # Open the species accuracy file into a recarray
    spp_data = utilities.csv2rec(self.species_accuracy_file)

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Read in the report metadata if it exists
    if self.report_metadata_file:
        rmp = xrmp.XMLReportMetadataParser(self.report_metadata_file)
    else:
        rmp = None

    # Subset the attributes to just species
    attrs = []
    for attr in mp.attributes:
        if attr.species_attr == 1 and 'NOTALY' not in attr.field_name:
            attrs.append(attr.field_name)

    # Iterate over the species and print out the statistics
    for spp in attrs:

        # Empty row to hold the formatted output
        species_row = []

        # Get the scientific and common names from the report metadata
        # if it exists; otherwise, just use the species symbol
        if rmp is not None:
            # Strip off any suffix if it exists
            # NOTE(review): spp.split('_')[0] itself cannot raise
            # IndexError; presumably rmp.get_species raises it for an
            # unknown species code — confirm against that class
            try:
                spp_plain = spp.split('_')[0]
                spp_info = rmp.get_species(spp_plain)
                spp_str = spp_info.spp_symbol + '<br/>'
                spp_str += spp_info.scientific_name + ' / '
                spp_str += spp_info.common_name
            except IndexError:
                spp_str = spp
        else:
            spp_str = spp
        para = p.Paragraph(spp_str, styles['body_style_10'])
        species_row.append(para)

        # Get the statistical information
        data = spp_data[spp_data.SPECIES == spp][0]
        counts = [data.OP_PP, data.OP_PA, data.OA_PP, data.OA_PA]
        prevalence = data.PREVALENCE
        kappa = data.KAPPA

        # Species prevalence
        prevalence_str = '%.4f' % prevalence
        para = p.Paragraph(prevalence_str, styles['body_style_10_right'])
        species_row.append(para)

        # Capture the plot counts in an inner table (2x2, matching the
        # header's nested table layout)
        count_cells = []
        count_row = []
        for i in range(0, 4):
            para = p.Paragraph(
                '%d' % counts[i], styles['body_style_10_right'])
            count_row.append(para)
            # Start a new table row after every second count
            if i % 2 == 1:
                count_cells.append(count_row)
                count_row = []
        t = p.Table(count_cells, colWidths=[0.75 * u.inch, 0.75 * u.inch])
        t.setStyle(
            p.TableStyle([
                ('GRID', (0, 0), (-1, -1), 1, colors.white),
                ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
        species_row.append(t)

        # Print out the kappa statistic
        kappa_str = '%.4f' % kappa
        para = p.Paragraph(kappa_str, styles['body_style_10_right'])
        species_row.append(para)

        # Push this row to the master species table
        species_table.append(species_row)

    # Style this into a reportlab table and add to the story
    col_widths = [(x * u.inch) for x in [4.0, 0.75, 1.5, 0.75]]
    t = p.Table(species_table, colWidths=col_widths)
    t.setStyle(
        p.TableStyle([
            ('BACKGROUND', (0, 0), (-1, -1), '#f1efe4'),
            ('GRID', (0, 0), (-1, -1), 2, colors.white),
            ('TOPPADDING', (0, 0), (0, -1), 2),
            ('BOTTOMPADDING', (0, 0), (0, -1), 2),
            ('LEFTPADDING', (0, 0), (0, -1), 6),
            ('RIGHTPADDING', (0, 0), (0, -1), 6),
            ('ALIGNMENT', (0, 0), (0, -1), 'LEFT'),
            ('VALIGN', (0, 0), (0, -1), 'TOP'),
            ('TOPPADDING', (1, 0), (1, -1), 2),
            ('BOTTOMPADDING', (1, 0), (1, -1), 2),
            ('LEFTPADDING', (1, 0), (1, -1), 6),
            ('RIGHTPADDING', (1, 0), (1, -1), 6),
            ('ALIGNMENT', (1, 0), (1, -1), 'RIGHT'),
            ('VALIGN', (1, 0), (1, 0), 'TOP'),
            ('VALIGN', (1, 1), (1, -1), 'MIDDLE'),
            ('TOPPADDING', (2, 0), (2, -1), 0),
            ('BOTTOMPADDING', (2, 0), (2, -1), 0),
            ('LEFTPADDING', (2, 0), (2, -1), 0),
            ('RIGHTPADDING', (2, 0), (2, -1), 0),
            ('ALIGNMENT', (2, 0), (2, -1), 'LEFT'),
            ('VALIGN', (2, 0), (2, -1), 'TOP'),
            ('TOPPADDING', (3, 0), (3, -1), 2),
            ('BOTTOMPADDING', (3, 0), (3, -1), 2),
            ('LEFTPADDING', (3, 0), (3, -1), 6),
            ('RIGHTPADDING', (3, 0), (3, -1), 6),
            ('ALIGNMENT', (3, 0), (3, -1), 'RIGHT'),
            ('VALIGN', (3, 0), (3, 0), 'TOP'),
            ('VALIGN', (3, 1), (3, -1), 'MIDDLE'),
        ]))
    story.append(t)
    story.append(p.Spacer(0, 0.1 * u.inch))

    rare_species_str = """
        Note that some very rare species do not appear in this accuracy
        report, because these species were not included when building
        the initial ordination model.  The full set of species is
        available upon request.
    """
    para = p.Paragraph(rare_species_str, styles['body_style'])
    story.append(para)

    # Return this story
    return story
def run(self):
    """
    Run the RDA ordination and write all result sections to a file.

    Reads the species matrix (self.spp_file) and environment matrix
    (self.env_file), verifies that their plot IDs align, optionally
    applies a SQRT or LOG transform to the species data, fits a
    NumpyRDA model, and writes eigenvalues, variable means,
    coefficient loadings, biplot scores, species centroids/tolerances,
    species and site information, and site LC/WA scores to
    self.ord_file in a sectioned CSV format.

    Raises
    ------
    ValueError
        If species and environment plot IDs do not match.
    """
    # Convert the species and environment matrices to numpy rec arrays
    spp_ra = utilities.csv2rec(self.spp_file)
    env_ra = utilities.csv2rec(self.env_file)

    # Extract the plot IDs from both the species and environment
    # matrices and ensure that they are equal
    spp_plot_ids = getattr(spp_ra, self.id_field)
    env_plot_ids = getattr(env_ra, self.id_field)
    if not np.all(spp_plot_ids == env_plot_ids):
        err_msg = "Species and environment plot IDs do not match"
        raise ValueError(err_msg)

    # Drop the ID column from both arrays
    spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
    env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

    # For the environment matrix, only keep the variables specified
    env_ra = mlab.rec_keep_fields(env_ra, self.variables)

    # Convert these matrices to pure floating point arrays
    spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
    env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

    # Apply transformation if desired
    if self.species_transform == "SQRT":
        spp = np.sqrt(spp)
    elif self.species_transform == "LOG":
        spp = np.log(spp)

    # Create the RDA object
    cca = numpy_ordination.NumpyRDA(spp, env)

    # The per-axis header is identical for every section below, so
    # build it once instead of six times
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])

    # Open the output file; try/finally guarantees it is closed even
    # if one of the model accessors raises
    numpy_fh = open(self.ord_file, "w")
    try:
        # Eigenvalues
        numpy_fh.write("### Eigenvalues ###\n")
        for (i, e) in enumerate(cca.eigenvalues):
            numpy_fh.write("RDA" + str(i + 1) + "," + "%.10f" % e + "\n")
        numpy_fh.write("\n")

        # Print out variable means
        numpy_fh.write("### Variable Means ###\n")
        for (i, m) in enumerate(cca.env_means):
            numpy_fh.write("%s,%.10f\n" % (self.variables[i], m))
        numpy_fh.write("\n")

        # Print out environmental coefficients loadings
        numpy_fh.write("### Coefficient Loadings ###\n")
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, c) in enumerate(cca.coefficients()):
            coeff = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (self.variables[i], coeff))
        numpy_fh.write("\n")

        # Print out biplot scores
        numpy_fh.write("### Biplot Scores ###\n")
        numpy_fh.write("VARIABLE," + header_str + "\n")
        for (i, b) in enumerate(cca.biplot_scores()):
            scores = ",".join(["%.10f" % x for x in b])
            numpy_fh.write("%s,%s\n" % (self.variables[i], scores))
        numpy_fh.write("\n")

        # Print out species centroids
        numpy_fh.write("### Species Centroids ###\n")
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, c) in enumerate(cca.species_centroids()):
            scores = ",".join(["%.10f" % x for x in c])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out species tolerances
        numpy_fh.write("### Species Tolerances ###\n")
        numpy_fh.write("SPECIES," + header_str + "\n")
        for (i, t) in enumerate(cca.species_tolerances()):
            # Fixed: was "%.21f", inconsistent with the %.10f precision
            # used by every other section in this file (a double only
            # carries ~17 significant digits anyway)
            scores = ",".join(["%.10f" % x for x in t])
            numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
        numpy_fh.write("\n")

        # Print out miscellaneous species information
        numpy_fh.write("### Miscellaneous Species Information ###\n")
        numpy_fh.write("SPECIES,WEIGHT,N2\n")
        species_weights, species_n2 = cca.species_information()
        for i in xrange(len(species_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (
                spp_ra.dtype.names[i], species_weights[i], species_n2[i]))
        numpy_fh.write("\n")

        # Print out site LC scores
        numpy_fh.write("### Site LC Scores ###\n")
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_lc_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Print out site WA scores
        numpy_fh.write("### Site WA Scores ###\n")
        numpy_fh.write("ID," + header_str + "\n")
        for (i, s) in enumerate(cca.site_wa_scores()):
            scores = ",".join(["%.10f" % x for x in s])
            numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
        numpy_fh.write("\n")

        # Miscellaneous site information
        numpy_fh.write("### Miscellaneous Site Information ###\n")
        numpy_fh.write("ID,WEIGHT,N2\n")
        site_weights, site_n2 = cca.site_information()
        for i in xrange(len(site_weights)):
            numpy_fh.write("%s,%.10f,%.10f\n" % (
                spp_plot_ids[i], site_weights[i], site_n2[i]))
    finally:
        # Close the file
        numpy_fh.close()
def _read_id_list_file(self, id_list_file):
    """
    Read a CSV file of plot IDs and return the first column's values
    joined into a single comma-delimited string.
    """
    # Load the file into a recarray; only the leading field of each
    # record (the ID) is of interest here
    records = utilities.csv2rec(id_list_file)
    id_strings = (str(record[0]) for record in records)
    return ','.join(id_strings)
def run_diagnostic(self):
    """
    Calculate per-species presence/absence accuracy statistics.

    Reads the observed and predicted files, subsets the observed plots
    to those present in the predicted file, and for every continuous
    species attribute (per the stand metadata) builds a binary error
    matrix and writes counts, prevalence, sensitivity/specificity,
    error rates, percent correct, odds ratio and kappa to
    self.statistics_file as CSV.
    """
    # Read the observed and predicted files into numpy recarrays
    obs = utilities.csv2rec(self.observed_file)
    prd = utilities.csv2rec(self.predicted_file)

    # Subset the observed data just to the IDs that are in the
    # predicted file
    obs_keep = np.in1d(
        getattr(obs, self.id_field), getattr(prd, self.id_field))
    obs = obs[obs_keep]

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Open the stats file and print out the header line; the with
    # block guarantees the file is closed even if a statistic raises
    with open(self.statistics_file, 'w') as stats_fh:
        out_list = [
            'SPECIES',
            'OP_PP',
            'OP_PA',
            'OA_PP',
            'OA_PA',
            'PREVALENCE',
            'SENSITIVITY',
            'FALSE_NEGATIVE_RATE',
            'SPECIFICITY',
            'FALSE_POSITIVE_RATE',
            'PERCENT_CORRECT',
            'ODDS_RATIO',
            'KAPPA',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field; missing metadata is
            # non-fatal, so warn and skip the field.  Narrowed from a
            # bare except so KeyboardInterrupt/SystemExit propagate.
            try:
                fm = mp.get_attribute(v)
            except Exception:
                err_msg = v + ' is missing metadata.'
                print(err_msg)
                continue

            # Only continue if this is a continuous species variable
            if fm.field_type != 'CONTINUOUS' or fm.species_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            # Create a binary error matrix from the obs and prd data
            stats = statistics.BinaryErrorMatrix(obs_vals, prd_vals)
            counts = stats.counts()

            # Build the list of items for printing
            out_list = [
                v,
                '%d' % counts[0, 0],
                '%d' % counts[0, 1],
                '%d' % counts[1, 0],
                '%d' % counts[1, 1],
                '%.4f' % stats.prevalence(),
                '%.4f' % stats.sensitivity(),
                '%.4f' % stats.false_negative_rate(),
                '%.4f' % stats.specificity(),
                '%.4f' % stats.false_positive_rate(),
                '%.4f' % stats.percent_correct(),
                '%.4f' % stats.odds_ratio(),
                '%.4f' % stats.kappa(),
            ]
            stats_fh.write(','.join(out_list) + '\n')
def run_diagnostic(self):
    """
    Calculate per-species presence/absence accuracy statistics.

    Reads the observed and predicted files, subsets the observed plots
    to those present in the predicted file, and for every continuous
    species attribute (per the stand metadata) builds a binary error
    matrix and writes counts, prevalence, sensitivity/specificity,
    error rates, percent correct, odds ratio and kappa to
    self.statistics_file as CSV.

    NOTE(review): this method appears to be a byte-for-byte duplicate
    of another run_diagnostic in this file — consider extracting a
    shared implementation.
    """
    # Read the observed and predicted files into numpy recarrays
    obs = utilities.csv2rec(self.observed_file)
    prd = utilities.csv2rec(self.predicted_file)

    # Subset the observed data just to the IDs that are in the
    # predicted file
    obs_keep = np.in1d(
        getattr(obs, self.id_field), getattr(prd, self.id_field))
    obs = obs[obs_keep]

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Open the stats file and print out the header line; the with
    # block guarantees the file is closed even if a statistic raises
    with open(self.statistics_file, 'w') as stats_fh:
        out_list = [
            'SPECIES',
            'OP_PP',
            'OP_PA',
            'OA_PP',
            'OA_PA',
            'PREVALENCE',
            'SENSITIVITY',
            'FALSE_NEGATIVE_RATE',
            'SPECIFICITY',
            'FALSE_POSITIVE_RATE',
            'PERCENT_CORRECT',
            'ODDS_RATIO',
            'KAPPA',
        ]
        stats_fh.write(','.join(out_list) + '\n')

        # For each variable, calculate the statistics
        for v in obs.dtype.names:

            # Get the metadata for this field; missing metadata is
            # non-fatal, so warn and skip the field.  Narrowed from a
            # bare except so KeyboardInterrupt/SystemExit propagate.
            try:
                fm = mp.get_attribute(v)
            except Exception:
                err_msg = v + ' is missing metadata.'
                print(err_msg)
                continue

            # Only continue if this is a continuous species variable
            if fm.field_type != 'CONTINUOUS' or fm.species_attr == 0:
                continue

            obs_vals = getattr(obs, v)
            prd_vals = getattr(prd, v)

            # Create a binary error matrix from the obs and prd data
            stats = statistics.BinaryErrorMatrix(obs_vals, prd_vals)
            counts = stats.counts()

            # Build the list of items for printing
            out_list = [
                v,
                '%d' % counts[0, 0],
                '%d' % counts[0, 1],
                '%d' % counts[1, 0],
                '%d' % counts[1, 1],
                '%.4f' % stats.prevalence(),
                '%.4f' % stats.sensitivity(),
                '%.4f' % stats.false_negative_rate(),
                '%.4f' % stats.specificity(),
                '%.4f' % stats.false_positive_rate(),
                '%.4f' % stats.percent_correct(),
                '%.4f' % stats.odds_ratio(),
                '%.4f' % stats.kappa(),
            ]
            stats_fh.write(','.join(out_list) + '\n')