def raw(h5, files, fields=[]):
    """
    Take list of .fits files and store them in the raw group

    fields - list of fields to keep. Use a subset for smaller file size.
    """
    raw = h5.create_group('/raw')
    hduL = []
    kicL = []
    qL = []
    for f in files:
        h = fits.open(f)
        hduL += [h]
        kicL += [h[0].header['KEPLERID']]
        qL += [h[0].header['QUARTER']]

    assert np.unique(kicL).size == 1, 'KEPLERID not the same'
    assert np.unique(qL).size == len(qL), 'duplicate quarters'

    h5.attrs['KEPLERID'] = kicL[0]
    for h, q in zip(hduL, qL):
        r = np.array(h[1].data)
        r = modcols(r)
        # subset the columns *before* storing, otherwise the trimmed
        # array never reaches the file
        if fields != []:
            r = mlab.rec_keep_fields(r, fields)
        raw['Q%i' % q] = r
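# Usage sketch, not from the original source: one way raw() might be
# called. The output path, input glob, and field subset are illustrative
# only; 'fits', 'np', 'mlab', and 'modcols' are assumed to be this
# module's imports/helpers.
def _demo_raw():
    import glob
    import h5py
    h5 = h5py.File('kic012345678.h5', 'w')         # hypothetical output
    files = glob.glob('kplr012345678-*_llc.fits')  # hypothetical inputs
    # keep a column subset for a smaller file, per the docstring
    raw(h5, files, fields=['TIME', 'SAP_FLUX', 'SAP_QUALITY'])
    h5.close()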
def load_aeronet(fname, keep_fields='all', header=False):
    """loads aeronet lev 2.0 csv file.

    fname: data file name
    keep_fields: 'all' or a list of fields
    header: whether to return header information along with the data.
    """
    std_day = datetime(1900, 1, 1, 0, 0, 0)

    def date2daynum(datestr):
        the_day = datetime.strptime(datestr, '%d:%m:%Y')
        return float((the_day - std_day).days)

    def time2seconds(timestr):
        h, m, s = [int(t) for t in timestr.split(':')]
        return float(h * 3600 + m * 60 + s)

    def daynum_seconds2datetime(daynum, seconds):
        return std_day + timedelta(days=int(daynum), seconds=int(seconds))

    headlines = []
    f = open(fname, 'r')
    for line_i, line in enumerate(f):
        line = line.rstrip()
        if line.startswith('Date(dd-mm-yy'):
            datefield, timefield = [re.sub(r'\W', '', tk)
                                    for tk in line.split(',')[0:2]]
            break
        headlines.append(line)
    f.close()

    skip_header_lines = line_i

    if header:
        headline = ','.join(headlines)
        headerd = dict()
        for attrname, converter in [('location', str), ('long', float),
                                    ('lat', float), ('elev', float),
                                    ('nmeas', int), ('PI', str),
                                    ('email', str)]:
            m = re.search(r'%s.{0,1}=([^,\s]*)' % attrname, headline,
                          flags=re.I)
            if m:
                try:
                    headerd[attrname] = converter(m.group(1))
                except Exception:
                    pass

    rawd = np.genfromtxt(fname, skip_header=skip_header_lines,
                         delimiter=',', names=True,
                         converters={0: date2daynum, 1: time2seconds})
    lend = len(rawd)
    dates = np.zeros(lend, dtype='O')
    for i in range(lend):
        dates[i] = daynum_seconds2datetime(rawd[datefield][i],
                                           rawd[timefield][i])

    newd = mlab.rec_append_fields(rawd, 'datetime', dates)
    newd = mlab.rec_drop_fields(newd, [datefield, timefield,
                                       'Last_Processing_Date'])
    if keep_fields != 'all':
        keep_fields = ['datetime'] + keep_fields
        newd = mlab.rec_keep_fields(newd, keep_fields)
    if header:
        return newd, headerd
    else:
        return newd
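# Usage sketch, not from the original source: load a level 2.0 AERONET
# file, keeping two columns plus the synthesized 'datetime' field. The
# file and column names are illustrative and must match the (sanitized)
# headers genfromtxt produces for the actual file.
def _demo_load_aeronet():
    data, info = load_aeronet('site_lev20.csv',
                              keep_fields=['AOT_500', 'AOT_675'],
                              header=True)
    print(info.get('location'))
    print(len(data))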
def qdt(r):
    """
    Small wrapper around rdt that removes duplicate names
    """
    rawFields = list(r.dtype.names)
    r = rdt(r)
    # keep only the fields rdt added (those not already in the input)
    dtFields = [f for f in r.dtype.names if f not in rawFields]
    return mlab.rec_keep_fields(r, dtFields)
def interesting_out(opts, interesting, data):
    """
    Take a list of fields, and the recs
    output recs as csv to opts["out"], e.g. --out
    """
    from matplotlib import mlab
    header = True
    # opts["out"] should be an open file-like object: rec2csv truncates
    # a plain path on every call, so only a handle lets this loop append
    # successive recarrays to one CSV with a single header
    for d in data:
        cleaned = mlab.rec_keep_fields(d, interesting)
        mlab.rec2csv(cleaned, opts["out"], withheader=header)
        header = False
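# Usage sketch, not from the original source: the recarray content and
# output path are illustrative. opts["out"] is passed as an open handle
# so that both recarrays land in the same CSV (see the comment above).
def _demo_interesting_out():
    import numpy as np
    rec = np.rec.fromrecords([(0.0, 1.00, 9), (1.0, 1.01, 9)],
                             names='time,flux,quality')
    opts = {"out": open('subset.csv', 'w')}
    interesting_out(opts, ['time', 'flux'], [rec, rec])
    opts["out"].close()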
def atpy2h5(files, out, diff='all', name='ds'):
    """
    atpy format to h5

    Parameters
    ----------
    files : list of input tables (atpy-readable files)
    out   : output h5 file. If none exists, we create it.
    diff  : list of fields that are stored as stacked arrays. Fields
            that do not differ from file to file are stored once, from
            the first file.
    """
    nfiles = len(files)
    t0 = atpy.Table(files[0])
    h5 = File(out)
    ds, ds1d = diffDS(t0.table_name, t0.data.dtype,
                      (nfiles, t0.data.size), h5, diff=diff)

    kicL = []
    nFail = 0
    for i in range(nfiles):
        if np.mod(i, 100) == 0:
            print i
        try:
            hdu = pyfits.open(files[i])
            data = hdu[1].data
            kic = hdu[1].header['KEPLERID']
            assert type(kic) == int
            kicL.append(kic)

            if diff != 'all':
                # store the shared (non-differing) columns once, then
                # keep only the per-file columns for the stacked dataset
                ds1d[:] = mlab.rec_drop_fields(data, diff)
                data = mlab.rec_keep_fields(data, diff)

            ds[i - nFail] = data
        except Exception:
            print sys.exc_info()[1]
            nFail += 1

    ds.resize(ds.shape[0] - nFail, axis=0)
    kicL = np.array(kicL)
    h5.create_dataset('KIC', data=kicL)
    print "%i files failed" % nFail
    h5.close()
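# Usage sketch, not from the original source: stack the time and flux
# columns of many tables into one HDF5 file, storing the remaining
# columns once. The glob and column names are illustrative only.
def _demo_atpy2h5():
    import glob
    files = glob.glob('Q3/*.fits')  # hypothetical input files
    atpy2h5(files, 'Q3.h5', diff=['TIME', 'SAP_FLUX'])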
def create_mag_table(self, outputPath, isocType="pdva", specType="basel"):
    """Create an HDF5 table that describes a set of magnitudes."""
    if os.path.exists(outputPath):
        os.remove(outputPath)
    title = os.path.splitext(os.path.basename(outputPath))[0]
    h5file = tables.openFile(outputPath, mode="w", title=title)
    table = h5file.createTable("/", 'mags', MagTableDef, "Mag Model Table")
    docs = self.collection.find({"compute_complete": True,
                                 "np_data": {"$exists": 1}})
    print "working on %i docs to read" % docs.count()
    lut = get_metallicity_LUT(isocType, specType)
    for doc in docs:
        print "reading", doc['_id']
        npData = doc['np_data']
        nRows = len(npData)
        # Append model information (about SFH, dust, etc)
        zmet = doc['pset']['zmet']
        Z = lut[zmet - 1]
        zmets = np.ones(nRows, dtype=np.float) * Z
        tau = doc['pset']['tau']
        taus = np.ones(nRows, dtype=np.float) * tau
        npDataAll = mlab.rec_append_fields(npData, ['Z', 'tau'],
                                           [zmets, taus])
        # Trim the recarray to just the desired fields
        npDataTrim = mlab.rec_keep_fields(
            npDataAll,
            ['Z', 'tau', 'age', 'mass', 'lbol', 'sfr',
             'TMASS_J', 'TMASS_H', 'TMASS_Ks',
             'MegaCam_u', 'MegaCam_g', 'MegaCam_r', 'MegaCam_i',
             'MegaCam_z', 'GALEX_NUV', 'GALEX_FUV'])
        # Append to HDF5
        table.append(npDataTrim)
        h5file.flush()
    h5file.close()
def match_files(pattern, query):
    """
    Return list of file names of CSV files that satisfy query conditions,
    where
        pattern ... a file name pattern for files that are to be tested
        query   ... a list of key=value pairs

    A file is a match if any row contains entries for all query conditions
    where each entry in the column labeled with key is identical to the
    corresponding value.

    Specify query as a dictionary in one of these forms:
        dict(k1=v1, k2=v2, k3=v3, ...)
        dict({"k1":v1, "k2":v2, "k3":v3, ...})
        {"k1":v1, "k2":v2, "k3":v3, ...}
    """
    # get all file names that match pattern
    infiles = glob.glob(pattern)
    infiles.sort()
    mlist = []
    # determine the query keys (in lowercase because csv2rec lowercases
    # headers) and query values
    qkeys = query.keys()
    qlckeys = [x.lower() for x in qkeys]
    qvalues = query.values()
    # check files for patterns
    for f in infiles:
        try:
            d = mlab.csv2rec(f, delimiter='\t')
        except ValueError:
            print(str(f) + " cannot be read by csv2rec")
        else:
            # check if the data contain the necessary columns
            if set(qlckeys) <= set(d.dtype.names):
                darray = mlab.rec_keep_fields(d, qlckeys)
                for row in darray:
                    if list(row) == qvalues:
                        # a match has been found
                        mlist.append(f)
                        break
    # return a list of file names for files with matches
    return mlist
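# Usage sketch, not from the original source: list the tab-separated CSV
# files holding at least one row with subject 's01' in condition 2. The
# pattern, keys, and values are illustrative only.
def _demo_match_files():
    hits = match_files('results/*.csv',
                       {'subject': 's01', 'condition': 2})
    for f in hits:
        print(f)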
def run(self):
    # Convert the species and environment matrices to numpy rec arrays
    spp_ra = utilities.csv2rec(self.spp_file)
    env_ra = utilities.csv2rec(self.env_file)

    # Extract the plot IDs from both the species and environment matrices
    # and ensure that they are equal
    spp_plot_ids = getattr(spp_ra, self.id_field)
    env_plot_ids = getattr(env_ra, self.id_field)
    if not np.all(spp_plot_ids == env_plot_ids):
        err_msg = "Species and environment plot IDs do not match"
        raise ValueError(err_msg)

    # Drop the ID column from both arrays
    spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
    env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

    # For the environment matrix, only keep the variables specified
    env_ra = mlab.rec_keep_fields(env_ra, self.variables)

    # Convert these matrices to pure floating point arrays
    spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
    env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

    # Apply transformation if desired
    if self.species_transform == "SQRT":
        spp = np.sqrt(spp)
    elif self.species_transform == "LOG":
        spp = np.log(spp)

    # Create the RDA object
    cca = numpy_ordination.NumpyRDA(spp, env)

    # Open the output file
    numpy_fh = open(self.ord_file, "w")

    # Eigenvalues
    numpy_fh.write("### Eigenvalues ###\n")
    for (i, e) in enumerate(cca.eigenvalues):
        numpy_fh.write("RDA" + str(i + 1) + "," + "%.10f" % e + "\n")
    numpy_fh.write("\n")

    # Print out variable means
    numpy_fh.write("### Variable Means ###\n")
    for (i, m) in enumerate(cca.env_means):
        numpy_fh.write("%s,%.10f\n" % (self.variables[i], m))
    numpy_fh.write("\n")

    # Print out environmental coefficient loadings
    numpy_fh.write("### Coefficient Loadings ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("VARIABLE," + header_str + "\n")
    for (i, c) in enumerate(cca.coefficients()):
        coeff = ",".join(["%.10f" % x for x in c])
        numpy_fh.write("%s,%s\n" % (self.variables[i], coeff))
    numpy_fh.write("\n")

    # Print out biplot scores
    numpy_fh.write("### Biplot Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("VARIABLE," + header_str + "\n")
    for (i, b) in enumerate(cca.biplot_scores()):
        scores = ",".join(["%.10f" % x for x in b])
        numpy_fh.write("%s,%s\n" % (self.variables[i], scores))
    numpy_fh.write("\n")

    # Print out species centroids
    numpy_fh.write("### Species Centroids ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("SPECIES," + header_str + "\n")
    for (i, c) in enumerate(cca.species_centroids()):
        scores = ",".join(["%.10f" % x for x in c])
        numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
    numpy_fh.write("\n")

    # Print out species tolerances
    numpy_fh.write("### Species Tolerances ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("SPECIES," + header_str + "\n")
    for (i, t) in enumerate(cca.species_tolerances()):
        scores = ",".join(["%.21f" % x for x in t])
        numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
    numpy_fh.write("\n")

    # Print out miscellaneous species information
    numpy_fh.write("### Miscellaneous Species Information ###\n")
    numpy_fh.write("SPECIES,WEIGHT,N2\n")
    species_weights, species_n2 = cca.species_information()
    for i in xrange(len(species_weights)):
        numpy_fh.write("%s,%.10f,%.10f\n" % (spp_ra.dtype.names[i],
                       species_weights[i], species_n2[i]))
    numpy_fh.write("\n")

    # Print out site LC scores
    numpy_fh.write("### Site LC Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("ID," + header_str + "\n")
    for (i, s) in enumerate(cca.site_lc_scores()):
        scores = ",".join(["%.10f" % x for x in s])
        numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
    numpy_fh.write("\n")

    # Print out site WA scores
    numpy_fh.write("### Site WA Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in xrange(cca.rank)])
    numpy_fh.write("ID," + header_str + "\n")
    for (i, s) in enumerate(cca.site_wa_scores()):
        scores = ",".join(["%.10f" % x for x in s])
        numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
    numpy_fh.write("\n")

    # Miscellaneous site information
    numpy_fh.write("### Miscellaneous Site Information ###\n")
    numpy_fh.write("ID,WEIGHT,N2\n")
    site_weights, site_n2 = cca.site_information()
    for i in xrange(len(site_weights)):
        numpy_fh.write("%s,%.10f,%.10f\n" % (spp_plot_ids[i],
                       site_weights[i], site_n2[i]))

    # Close the file
    numpy_fh.close()
def run_diagnostic(self):
    # Shortcut to the parameter parser
    p = self.parameter_parser

    # ID field
    id_field = p.summary_level + 'ID'

    # Root directory for Riemann files
    root_dir = p.riemann_output_folder

    # Read in hex input file
    obs_data = utilities.csv2rec(self.hex_attribute_file)

    # Get the hexagon levels and ensure that the fields exist in the
    # hex_attribute file
    hex_resolutions = p.riemann_hex_resolutions
    hex_fields = [x[0] for x in hex_resolutions]
    for field in hex_fields:
        if field not in obs_data.dtype.names:
            err_msg = 'Field ' + field + ' does not exist in the '
            err_msg += 'hex_attribute file'
            raise ValueError(err_msg)

    # Create the directory structure based on the hex levels
    hex_levels = ['hex_' + str(x[1]) for x in hex_resolutions]
    all_levels = ['plot_pixel'] + hex_levels
    for level in all_levels:
        sub_dir = os.path.join(root_dir, level)
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

    # Get the values of k
    k_values = p.riemann_k_values

    # Create a dictionary of plot ID to image year (or model_year for
    # non-imagery models) for these plots
    if p.model_type in p.imagery_model_types:
        id_x_year = dict((x[id_field], x.IMAGE_YEAR) for x in obs_data)
    else:
        id_x_year = dict((x[id_field], p.model_year) for x in obs_data)

    # Create a PredictionRun instance
    pr = prediction_run.PredictionRun(p)

    # Get the neighbors and distances for these IDs
    pr.calculate_neighbors_at_ids(id_x_year, id_field=id_field)

    # Create the lookup of id_field to LOC_ID for the hex plots
    nsa_id_dict = dict((x[id_field], x.LOC_ID) for x in obs_data)

    # Create a dictionary between id_field and no_self_assign_field
    # for the model plots
    env_file = p.environmental_matrix_file
    env_data = utilities.csv2rec(env_file)
    model_nsa_id_dict = dict((getattr(x, id_field), x.LOC_ID)
                             for x in env_data)

    # Stitch the two dictionaries together
    for id in sorted(model_nsa_id_dict.keys()):
        if id not in nsa_id_dict:
            nsa_id_dict[id] = model_nsa_id_dict[id]

    # Get the stand attribute metadata and retrieve only the
    # continuous accuracy attributes
    stand_metadata_file = p.stand_metadata_file
    mp = xsmp.XMLStandMetadataParser(stand_metadata_file)
    attrs = [x.field_name for x in mp.attributes
             if x.field_type == 'CONTINUOUS' and x.accuracy_attr == 1]

    # Subset the attributes for fields that are in the
    # hex_attribute file
    attrs = [x for x in attrs if x in obs_data.dtype.names]
    plot_pixel_obs = mlab.rec_keep_fields(obs_data, [id_field] + attrs)

    # Write out the plot_pixel observed file
    file_name = 'plot_pixel_observed.csv'
    output_file = os.path.join(root_dir, 'plot_pixel', file_name)
    utilities.rec2csv(plot_pixel_obs, output_file)

    # Iterate over values of k
    for k in k_values:

        # Construct the output file name
        file_name = '_'.join(('plot_pixel', 'predicted', 'k' + str(k)))
        file_name += '.csv'
        output_file = os.path.join(root_dir, 'plot_pixel', file_name)
        out_fh = open(output_file, 'w')

        # For the plot/pixel scale, retrieve the independent predicted
        # data for this value of k. Even though attributes are being
        # returned from this function, we want to use the attribute list
        # that we've already found above.
        prediction_generator = pr.calculate_predictions_at_k(
            k=k, id_field=id_field, independent=True,
            nsa_id_dict=nsa_id_dict)

        # Write out the field names
        out_fh.write(id_field + ',' + ','.join(attrs) + '\n')

        # Write out the predictions for this k
        for plot_prediction in prediction_generator:

            # Write this record to the predicted attribute file
            pr.write_predicted_record(plot_prediction, out_fh, attrs=attrs)

        # Close this file
        out_fh.close()

    # Create the fields for which to extract statistics at the hexagon
    # levels
    mean_fields = [(id_field, len, 'PLOT_COUNT')]
    mean_fields.extend([(x, np.mean, x) for x in attrs])
    mean_fields = tuple(mean_fields)

    sd_fields = [(id_field, len, 'PLOT_COUNT')]
    sd_fields.extend([(x, np.std, x) for x in attrs])
    sd_fields = tuple(sd_fields)

    stat_sets = {
        'mean': mean_fields,
        'std': sd_fields,
    }

    # For each hexagon level, associate the plots with their hexagon ID
    # and find observed and predicted statistics for each hexagon
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        min_plots_per_hex = hex_resolution[3]
        prefix = 'hex_' + str(hex_distance)

        # Create a crosswalk between the id_field and the hex_id_field
        id_x_hex = mlab.rec_keep_fields(obs_data, [id_field, hex_id_field])

        # Iterate over all sets of statistics and write a unique file
        # for each set
        for (stat_name, stat_fields) in stat_sets.iteritems():

            # Get the output file name
            obs_out_file = \
                '_'.join((prefix, 'observed', stat_name)) + '.csv'
            obs_out_file = os.path.join(root_dir, prefix, obs_out_file)

            # Write out the observed file
            self.write_hex_stats(obs_data, hex_id_field, stat_fields,
                                 min_plots_per_hex, obs_out_file)

        # Iterate over values of k for the predicted values
        for k in k_values:

            # Open the plot_pixel predicted file for this value of k
            # and join the hex_id_field to the recarray
            prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
            prd_file = os.path.join(root_dir, 'plot_pixel', prd_file)
            prd_data = utilities.csv2rec(prd_file)
            prd_data = mlab.rec_join(id_field, prd_data, id_x_hex)

            # Iterate over all sets of statistics and write a unique file
            # for each set
            for (stat_name, stat_fields) in stat_sets.iteritems():

                # Get the output file name
                prd_out_file = '_'.join((prefix, 'predicted', 'k' + str(k),
                                         stat_name)) + '.csv'
                prd_out_file = os.path.join(root_dir, prefix, prd_out_file)

                # Write out the predicted file
                self.write_hex_stats(prd_data, hex_id_field, stat_fields,
                                     min_plots_per_hex, prd_out_file)

    # Calculate the ECDF and AC statistics
    # For ECDF and AC, it is a paired comparison between the observed
    # and predicted data. We do this at each value of k and for each
    # hex resolution level.

    # Open the stats file
    stats_file = p.hex_statistics_file
    stats_fh = open(stats_file, 'w')
    header_fields = ['LEVEL', 'K', 'VARIABLE', 'STATISTIC', 'VALUE']
    stats_fh.write(','.join(header_fields) + '\n')

    # Create a list of RiemannComparison instances which store the
    # information needed to do comparisons between observed and predicted
    # files for any level or value of k
    compare_list = []
    for hex_resolution in hex_resolutions:
        (hex_id_field, hex_distance) = hex_resolution[0:2]
        prefix = 'hex_' + str(hex_distance)
        obs_file = '_'.join((prefix, 'observed', 'mean')) + '.csv'
        obs_file = os.path.join(root_dir, prefix, obs_file)
        for k in k_values:
            prd_file = '_'.join((prefix, 'predicted', 'k' + str(k),
                                 'mean')) + '.csv'
            prd_file = os.path.join(root_dir, prefix, prd_file)
            r = RiemannComparison(prefix, obs_file, prd_file,
                                  hex_id_field, k)
            compare_list.append(r)

    # Add the plot_pixel comparisons to this list
    prefix = 'plot_pixel'
    obs_file = 'plot_pixel_observed.csv'
    obs_file = os.path.join(root_dir, prefix, obs_file)
    for k in k_values:
        prd_file = 'plot_pixel_predicted_k' + str(k) + '.csv'
        prd_file = os.path.join(root_dir, prefix, prd_file)
        r = RiemannComparison(prefix, obs_file, prd_file, id_field, k)
        compare_list.append(r)

    # Do all the comparisons
    for c in compare_list:

        # Open the observed file
        obs_data = utilities.csv2rec(c.obs_file)

        # Open the predicted file
        prd_data = utilities.csv2rec(c.prd_file)

        # Ensure that the IDs between the observed and predicted
        # data line up; raise if any ID differs
        ids1 = getattr(obs_data, c.id_field)
        ids2 = getattr(prd_data, c.id_field)
        if not np.all(ids1 == ids2):
            err_msg = 'IDs do not match between observed and '
            err_msg += 'predicted data'
            raise ValueError(err_msg)

        for attr in attrs:
            arr1 = getattr(obs_data, attr)
            arr2 = getattr(prd_data, attr)
            rv = RiemannVariable(arr1, arr2)

            gmfr_stats = rv.gmfr_statistics()
            for stat in ('gmfr_a', 'gmfr_b', 'ac', 'ac_sys', 'ac_uns'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    gmfr_stats[stat])
                stats_fh.write(stat_line)

            ks_stats = rv.ks_statistics()
            for stat in ('ks_max', 'ks_mean'):
                stat_line = '%s,%d,%s,%s,%.4f\n' % (
                    c.prefix.upper(), c.k, attr, stat.upper(),
                    ks_stats[stat])
                stats_fh.write(stat_line)
def extract_data(pattern, query, headers, ctypes=None, fname=''):
    """
    Extract data from CSV files whose name matches pattern.

    Every record in a given file is checked if it satisfies the query
    condition(s). If the query condition(s) are satisfied, data from the
    columns specified by headers are extracted from that record.
    Collected records are returned in a numpy record array and, if a
    filename fname is specified, they are also written to fname in
    tab-separated CSV format. If no matching records are found, an empty
    record array of type bool is returned.

    pattern : a file name pattern for files from which records are to be
              extracted
    query   : conditions in the form of a dictionary (list of key-value
              pairs) that need to be fulfilled for a record to be
              extracted; the query dictionary is specified in one of
              these forms:
                  dict(k1=v1, k2=v2, k3=v3, ...)
                  dict({"k1":v1, "k2":v2, "k3":v3, ...})
                  {"k1":v1, "k2":v2, "k3":v3, ...}
    headers : a list of strings specifying the column headers for the
              columns which are to be extracted
    ctypes  : if not None, a dictionary mapping column number or munged
              column name to a converter function; the column type
              converter dictionary can be specified as:
                  {"k1":t1, "k2":t2, "k3":t3, ...}
              where the t can be, e.g., str, int, float, bool.
    fname   : if defined, the name of the CSV file (tab-separated) to
              which extracted records are written
    """
    # get all file names that match pattern
    infiles = glob.glob(pattern)
    infiles.sort()
    # determine the query and header keys (in lowercase because csv2rec
    # lowercases headers), and query values
    qkeys = query.keys()
    qlckeys = [x.lower() for x in qkeys]
    qvalues = query.values()
    hlckeys = [x.lower() for x in headers]
    if ctypes:
        ctypes_lc = dict((key.lower(), value)
                         for (key, value) in ctypes.items())
    else:
        ctypes_lc = None
    mkeys = set(qlckeys)
    mkeys = mkeys.union(hlckeys)
    mrows = []
    # check files for query patterns
    for f in infiles:
        d = mlab.csv2rec(f, delimiter='\t', converterd=ctypes_lc)
        # check if the data contain the necessary columns
        if mkeys <= set(d.dtype.names):
            # find the records that match the query
            darray = mlab.rec_keep_fields(d, qlckeys)
            imatch = np.array([False] * darray.size)
            for i in range(darray.size):
                if list(darray[i]) == qvalues:
                    imatch[i] = True
            # get data from records that matched the query
            if any(imatch):
                marray = mlab.rec_keep_fields(d, hlckeys)[imatch]
                for row in marray:
                    mrows.append(row.tolist())
    # write data from matching records to file if requested and return
    # results
    if mrows:
        # The following does not work because the mlab.csv2rec()
        # converterd data type specifications are different from the
        # np.core.records.fromrecords() dtype specifications ...
        #results = np.core.records.fromrecords(mrows, dtype=ctypes_lc)
        # ... so, for now, we cross our fingers and hope that
        # np.core.records.fromrecords() intuits the data types correctly,
        # which it seems to do (most of the time)
        results = np.core.records.fromrecords(mrows, names=headers)
    else:
        dt = [(h, bool) for h in headers]
        results = np.recarray(0, dtype=dt)
    if fname != '':
        mlab.rec2csv(results, fname, delimiter='\t')
    return results
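# Usage sketch, not from the original source: pull the 'rt' and
# 'accuracy' columns from every matching row, coercing 'rt' to float,
# and also dump the result to a tab-separated file. All names are
# illustrative only.
def _demo_extract_data():
    rows = extract_data('results/*.csv',
                        {'subject': 's01', 'condition': 2},
                        ['rt', 'accuracy'],
                        ctypes={'rt': float},
                        fname='s01_cond2.csv')
    print(rows.size)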
def write_baseline_file(recarray, track):
    '''
    write a simple ascii file with date and baseline columns
    '''
    subset = mlab.rec_keep_fields(recarray, ['roidate', 'bperp'])
    # assumed intent: interpolate the track into the file name (the
    # original passed track to a format string with no placeholder)
    mlab.rec2csv(subset, 'baselines_{0}.txt'.format(track),
                 withheader=False, delimiter=' ')
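# Usage sketch, not from the original source: the recarray just needs
# 'roidate' and 'bperp' columns; the values and track number are
# illustrative only.
def _demo_write_baseline_file():
    import numpy as np
    rec = np.rec.fromrecords([('20070115', -12.3), ('20070301', 48.0)],
                             names='roidate,bperp')
    write_baseline_file(rec, track=282)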