def test_design():
    # Check that you get the design matrix we expect
    t1 = F.Term('x')
    t2 = F.Term('y')

    n = F.make_recarray([2, 4, 5], 'x')
    yield assert_almost_equal, t1.formula.design(n)['x'], n['x']

    f = t1.formula + t2.formula
    n = F.make_recarray([(2, 3), (4, 5), (5, 6)], 'xy')
    yield assert_almost_equal, f.design(n)['x'], n['x']
    yield assert_almost_equal, f.design(n)['y'], n['y']

    f = t1.formula + t2.formula + F.I + t1.formula * t2.formula
    yield assert_almost_equal, f.design(n)['x'], n['x']
    yield assert_almost_equal, f.design(n)['y'], n['y']
    yield assert_almost_equal, f.design(n)['1'], 1
    yield assert_almost_equal, f.design(n)['x*y'], n['x'] * n['y']

    # Dropping a term used by the formula should raise on design()
    ny = ML.rec_drop_fields(n, 'y')
    yield assert_raises, ValueError, f.design, ny

    n = np.array([(2, 3, 'a'), (4, 5, 'b'), (5, 6, 'a')],
                 np.dtype([('x', float), ('y', float), ('f', 'S1')]))
    f = F.Factor('f', ['a', 'b'])
    ff = t1.formula * f + F.I
    yield assert_almost_equal, ff.design(n)['f_a*x'], n['x'] * [1, 0, 1]
    yield assert_almost_equal, ff.design(n)['f_b*x'], n['x'] * [0, 1, 0]
    yield assert_almost_equal, ff.design(n)['1'], 1
def gethistprices(query, numrows=1000, **kwargs):
    import matplotlib.mlab as mlab
    import numpy as np

    rec_arr = sqlite2rec(query, **kwargs)

    # Integer-code each symbol, then group by that code to count rows
    (syms, posuniq, pos) = np.unique(rec_arr.sym, return_index=True,
                                     return_inverse=True)
    new_rec_arr = mlab.rec_append_fields(rec_arr, 'idx', pos)
    nosym = mlab.rec_drop_fields(new_rec_arr, ['sym'])
    recnumrecs = mlab.rec_groupby(nosym, ('idx',),
                                  (('idx', len, 'idxcount'),))

    # Keep only symbols with at least numrows rows
    idx = np.nonzero(recnumrecs.idxcount >= numrows)[0]
    idxcount = len(recnumrecs[idx])

    xs = np.empty((idxcount, numrows, len(nosym[0]) - 1), dtype=float)
    for i in range(idxcount):
        if kwargs.get('verbose') and i % 50 == 0:
            print('%d of %d' % (i, idxcount))
        curdata = nosym[nosym.idx == idx[i]]
        curdata_arr = np.array(curdata.tolist(), dtype=float)
        xs[i] = curdata_arr[0:numrows, 0:-1]
    return (syms[idx], xs)
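# Hedged usage sketch (not from the original source): it assumes sqlite2rec
# returns a record array whose 'sym' column holds the symbol and whose
# remaining columns are numeric; the query text is illustrative only.
syms, xs = gethistprices('SELECT sym, close, volume FROM hist', numrows=250)
# xs holds one (numrows x n_numeric_columns) slice per symbol that has at
# least numrows rows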
def join_rec(r1, field1, r2, field2):
    """1-to-1 join with non-unique left-hand-side keys."""
    mapping = dict(zip(r2[field2], range(len(r2))))
    diff = np.setdiff1d(r1[field1], r2[field2])
    r2len = len(r2)
    if len(diff) > 0:
        # Unmatched left-hand keys map to a padding row appended below
        print("WARNING: %s no matching key: %s" % (field2, diff))
        for i in range(len(diff)):
            mapping[diff[i]] = r2len
    r2copy = mlab.rec_drop_fields(r2, (field2,))
    r2copy.resize(r2len + 1)
    joinfields = list(r2copy.dtype.names)
    dtypes = []
    for i in range(len(joinfields)):
        # Promote integer columns to double so the padding row can hold a
        # sentinel value
        if r2copy.dtype[i].kind == "i":
            dtypes.append(np.double)
        else:
            dtypes.append(r2copy.dtype[i])
        if r2copy.dtype[i].kind == "f":
            r2copy[r2copy.dtype.names[i]][-1] = NULL_VALUE
        # Avoid name collisions with fields already in r1
        while joinfields[i] in r1.dtype.names:
            joinfields[i] = joinfields[i] + "_"
    rightrec = r2copy[[mapping[key] for key in r1[field1]]]
    r1 = mlab.rec_append_fields(r1, joinfields,
                                [rightrec[n] for n in rightrec.dtype.names],
                                dtypes)
    return r1
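# A minimal usage sketch for join_rec on toy record arrays (not from the
# original source). It assumes numpy is importable and that NULL_VALUE is
# the module-level sentinel join_rec writes into unmatched float fields.
import numpy as np

left = np.rec.fromrecords([(1, 10.0), (2, 20.0), (3, 30.0)],
                          names=['key', 'lval'])
right = np.rec.fromrecords([(1, 0.1), (2, 0.2)], names=['key', 'rval'])
joined = join_rec(left, 'key', right, 'key')
# key 3 has no match in right, so joined['rval'][-1] holds the NULL_VALUE
# padding written by join_rec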
def writefiles(tiles, fnbase, overwrite=False):
    from astropy.io import fits
    from astropy.io import ascii
    from matplotlib.mlab import rec_drop_fields
    from astropy import table
    fits.writeto(fnbase + '.fits', tiles, overwrite=overwrite)
    hdulist = fits.open(fnbase + '.fits', mode='update')
    hdulist[1].header['EXTNAME'] = 'TILES'
    hdulist.close()
    tilestab = table.Table(
        rec_drop_fields(tiles, ['brightra', 'brightdec', 'brightvtmag']))
    metadata = {
        'tileid': ('', 'Unique tile ID'),
        'ra': ('deg', 'Right ascension'),
        'dec': ('deg', 'Declination'),
        'pass': ('', 'DESI layer'),
        'in_desi': ('', '1=within DESI footprint; 0=outside'),
        'ebv_med': ('mag', 'Median Galactic E(B-V) extinction in tile'),
        'airmass': ('', 'Airmass if observed at hour angle 15 deg'),
        'star_density': ('deg^-2', 'median number density of Gaia stars '
                         'brighter than 19.5 mag in tile'),
        'exposefac': ('', 'Multiplicative exposure time factor from '
                      'airmass and E(B-V)'),
        'program': ('', 'DARK, GRAY, BRIGHT, or EXTRA'),
        'obsconditions': ('', '1 for DARK, 2 for GRAY, 4 for BRIGHT, '
                          '0 for EXTRA'),
        'brightra': ('deg', 'RAs of 3 brightest Tycho-2 stars in tile'),
        'brightdec': ('deg', 'Decs of 3 brightest Tycho-2 stars in tile'),
        'brightvtmag': ('mag', 'V_T magnitudes of 3 brightest Tycho-2 '
                        'stars in tile'),
        'centerid': ('', 'Unique tile ID of pass 0 tile corresponding '
                     'to this tile'),
    }
    from astropy import units as u
    unitdict = {'': None, 'deg': u.deg, 'mag': u.mag,
                'deg^-2': 1 / u.deg / u.deg}
    for name in tilestab.dtype.names:
        tilestab[name].unit = unitdict[metadata[name][0]]
        tilestab[name].description = metadata[name][1]
    ascii.write(tilestab, fnbase + '.ecsv', format='ecsv',
                overwrite=overwrite)
def subj_by_subj_map_init(self, runs=2, verbose=-1, **map_kwargs):
    """Initialize nodes by finding the MAP for each subject separately.

    Input:
        runs - number of MAP runs for each subject
        map_kwargs - other arguments that will be passed on to the map
            function

    Note:
        This function should be run prior to node creation, i.e. before
        running mcmc() or map().
    """
    # check if nodes were created; if they were, it causes problems for
    # deepcopy
    assert (not self.nodes), \
        "function should be used before nodes are initialized."

    # init
    subjs = self._subjs
    n_subjs = len(subjs)
    empty_s_model = deepcopy(self)
    empty_s_model.is_group_model = False
    del empty_s_model._num_subjs, empty_s_model._subjs, empty_s_model.data

    self.create_nodes()

    # loop over subjects
    for i_subj in range(n_subjs):
        # create and fit single subject
        if verbose > 1:
            print("*!*!* fitting subject %d *!*!*" % subjs[i_subj])
        t_data = self.data[self.data['subj_idx'] == subjs[i_subj]]
        t_data = rec_drop_fields(t_data, ['data_idx'])
        s_model = deepcopy(empty_s_model)
        s_model.data = t_data
        s_model.map(method='fmin_powell', runs=runs, **map_kwargs)

        # copy to original model
        for (name, node) in s_model.group_nodes.items():
            self.subj_nodes[name][i_subj].value = node.value

    # set group and var nodes
    for (param_name, d) in self.params_dict.items():
        for (tag, nodes) in d.subj_nodes.items():
            subj_values = [x.value for x in nodes]
            # set group node
            if d.group_nodes:
                d.group_nodes[tag].value = np.mean(subj_values)
            # set var node
            if d.var_nodes:
                if d.var_type == 'std':
                    d.var_nodes[tag].value = np.std(subj_values)
                elif d.var_type == 'precision':
                    d.var_nodes[tag].value = np.std(subj_values) ** -2
                elif d.var_type == 'sample_size':
                    v = np.var(subj_values)
                    m = np.mean(subj_values)
                    d.var_nodes[tag].value = (m * (1 - m)) / v - 1
                else:
                    raise ValueError("unknown var_type")
def load_aeronet(fname, keep_fields='all', header=False):
    """Load an AERONET level 2.0 csv file.

    fname: data file name
    keep_fields: 'all' or a list of fields
    header: whether to return header information along with the data.
    """
    std_day = datetime(1900, 1, 1, 0, 0, 0)

    def date2daynum(datestr):
        the_day = datetime.strptime(datestr, '%d:%m:%Y')
        return float((the_day - std_day).days)

    def time2seconds(timestr):
        h, m, s = [int(t) for t in timestr.split(':')]
        return float(h * 3600 + m * 60 + s)

    def daynum_seconds2datetime(daynum, seconds):
        return std_day + timedelta(days=int(daynum), seconds=int(seconds))

    # Scan the free-form header until the column-name line is found
    headlines = []
    with open(fname, 'r') as f:
        for line_i, line in enumerate(f):
            line = line.rstrip()
            if line.startswith('Date(dd-mm-yy'):
                datefield, timefield = [re.sub(r'\W', '', tk)
                                        for tk in line.split(',')[0:2]]
                break
            headlines.append(line)

    skip_header_lines = line_i

    if header:
        headline = ','.join(headlines)
        headerd = dict()
        for attrname, converter in [('location', str), ('long', float),
                                    ('lat', float), ('elev', float),
                                    ('nmeas', int), ('PI', str),
                                    ('email', str)]:
            m = re.search(r'%s.{0,1}=([^,\s]*)' % attrname, headline,
                          flags=re.I)
            if m:
                try:
                    headerd[attrname] = converter(m.group(1))
                except Exception:
                    pass

    rawd = np.genfromtxt(fname, skip_header=skip_header_lines,
                         delimiter=',', names=True,
                         converters={0: date2daynum, 1: time2seconds})
    lend = len(rawd)
    dates = np.zeros(len(rawd), dtype='O')
    for i in range(lend):
        dates[i] = daynum_seconds2datetime(rawd[datefield][i],
                                           rawd[timefield][i])

    newd = mlab.rec_append_fields(rawd, 'datetime', dates)
    newd = mlab.rec_drop_fields(newd, [datefield, timefield,
                                       'Last_Processing_Date'])
    if keep_fields != 'all':
        keep_fields = ['datetime'] + keep_fields
        newd = mlab.rec_keep_fields(newd, keep_fields)

    if header:
        return newd, headerd
    else:
        return newd
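# Hedged example call (not from the original source): 'site.lev20' and
# 'AOT_500' are placeholders for a real AERONET level 2.0 file and one of
# its column names as parsed by np.genfromtxt.
data, hdr = load_aeronet('site.lev20', keep_fields=['AOT_500'], header=True)
print(hdr.get('location'), data['datetime'][0], data['AOT_500'][0])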
def to_hdf(ec, h5file):
    with h5plus.File(h5file) as h5:
        for k in dsetkeys:
            h5[k] = getattr(ec, k)

        r = ec.dfAc_st.to_records()
        rless = mlab.rec_drop_fields(r, ['index'])
        sindex = r['index'].astype(str)
        r = mlab.rec_append_fields(rless, 'index', sindex)
        h5.attrs['dfAc_st'] = r
        h5.attrs['kAs'] = ec.kAs
def atpy2h5(files, out, diff='all', name='ds'):
    """
    atpy format to h5

    Parameters
    ----------
    inp  : globable string specifying where the input files are
    out  : output h5 file.  If none exists, we create it.
    diff : list of fields that are stored as stacked arrays.  For fields
           that do not differ between files, we store only the first
           element.
    """
    nfiles = len(files)
    t0 = atpy.Table(files[0])
    h5 = File(out)
    ds, ds1d = diffDS(t0.table_name, t0.data.dtype,
                      (nfiles, t0.data.size), h5, diff=diff)

    kicL = []
    nFail = 0
    for i in range(nfiles):
        if np.mod(i, 100) == 0:
            print(i)
        try:
            hdu = pyfits.open(files[i])
            data = hdu[1].data
            kic = hdu[1].header['KEPLERID']
            assert type(kic) == int
            kicL.append(kic)

            if diff != 'all':
                data = mlab.rec_keep_fields(data, diff)
                ds1d[:] = mlab.rec_drop_fields(data, diff)
            ds[i - nFail] = data
        except Exception:
            print(sys.exc_info()[1])
            nFail += 1

    ds.resize(ds.shape[0] - nFail, axis=0)
    kicL = np.array(kicL)
    h5.create_dataset('KIC', data=kicL)
    print("%i files failed" % nFail)
    h5.close()
def writefiles(tiles, fnbase, overwrite=False):
    from astropy.io import fits
    from astropy.io import ascii
    from matplotlib.mlab import rec_drop_fields
    from astropy import table
    # under duress... uppercase
    tiles.dtype.names = [n.upper() for n in tiles.dtype.names]
    fits.writeto(fnbase + '.fits', tiles, overwrite=overwrite)
    hdulist = fits.open(fnbase + '.fits', mode='update')
    hdulist[1].header['EXTNAME'] = 'TILES'
    hdulist.close()
    tilestab = table.Table(
        rec_drop_fields(tiles, ['BRIGHTRA', 'BRIGHTDEC', 'BRIGHTVTMAG']))
    metadata = {
        'tileid': ('', 'Unique tile ID'),
        'ra': ('deg', 'Right ascension'),
        'dec': ('deg', 'Declination'),
        'pass': ('', 'DESI layer'),
        'in_desi': ('', '1=within DESI footprint; 0=outside'),
        'ebv_med': ('mag', 'Median Galactic E(B-V) extinction in tile'),
        'airmass': ('', 'Airmass if observed at hour angle 15 deg'),
        'star_density': ('deg^-2', 'median number density of Gaia stars '
                         'brighter than 19.5 mag in tile'),
        'exposefac': ('', 'Multiplicative exposure time factor from '
                      'airmass and E(B-V)'),
        'program': ('', 'DARK, GRAY, BRIGHT, or EXTRA'),
        'obsconditions': ('', '1 for DARK, 2 for GRAY, 4 for BRIGHT, '
                          '0 for EXTRA'),
        'brightra': ('deg', 'RAs of 3 brightest Tycho-2 stars in tile'),
        'brightdec': ('deg', 'Decs of 3 brightest Tycho-2 stars in tile'),
        'brightvtmag': ('mag', 'V_T magnitudes of 3 brightest Tycho-2 '
                        'stars in tile'),
        'centerid': ('', 'Unique tile ID of pass 0 tile corresponding '
                     'to this tile'),
    }
    metadatacaps = {k.upper(): v for k, v in metadata.items()}
    from astropy import units as u
    unitdict = {'': None, 'deg': u.deg, 'mag': u.mag,
                'deg^-2': 1 / u.deg / u.deg}
    for name in tilestab.dtype.names:
        tilestab[name].unit = unitdict[metadatacaps[name][0]]
        tilestab[name].description = metadatacaps[name][1]
    ascii.write(tilestab, fnbase + '.ecsv', format='ecsv',
                overwrite=overwrite)
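# Hedged usage sketch for writefiles (not from the original source): a
# one-row tiles array with the columns the metadata dict documents; every
# dtype and value here is invented for illustration.
import numpy as np

tiles = np.zeros(1, dtype=[
    ('tileid', 'i4'), ('ra', 'f8'), ('dec', 'f8'), ('pass', 'i2'),
    ('in_desi', 'i2'), ('ebv_med', 'f4'), ('airmass', 'f4'),
    ('star_density', 'f4'), ('exposefac', 'f4'), ('program', 'U6'),
    ('obsconditions', 'i4'), ('brightra', '3f8'), ('brightdec', '3f8'),
    ('brightvtmag', '3f4'), ('centerid', 'i4')])
writefiles(tiles, 'desi-tiles', overwrite=True)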
def modcols(r0):
    """
    Modify Columns

    1. Changes TIME, CADENCENO to t, cad
    2. rnQ      - normalize quarter
    3. rnanTime - remove nans from time series
    """
    r = r0.copy()
    oldName = ['TIME', 'CADENCENO']
    newName = ['t', 'cad']
    for o, n in zip(oldName, newName):
        r = mlab.rec_append_fields(r, n, r[o])
        # rec_drop_fields expects a sequence of names, not a bare string
        r = mlab.rec_drop_fields(r, [o])

    r = keplerio.rnQ(r)
    r = keplerio.rnanTime(r)
    return r
def create_info_table(self, raster_join_field, attribute_file,
                      attribute_join_field, drop_fields=None):
    """
    Create ArcInfo table from attribute csv file

    Parameters
    ----------
    raster_join_field : str
        field in raster to use for joining to attribute data
    attribute_file : str
        name and path of file containing attribute information
    attribute_join_field : str
        field in attribute file to use to join to raster
    drop_fields : list of str
        fields in the attribute file to drop before join to raster

    Returns
    -------
    name of temp ArcInfo table, list of fields to join from info table
    """
    print('Building info table from attribute file')

    # Crosswalk of numpy types to ESRI types for numeric data
    numpy_to_esri_type = {
        ('b', 1): 'SHORT',
        ('i', 1): 'SHORT',
        ('i', 2): 'SHORT',
        ('i', 4): 'LONG',
        ('f', 4): 'FLOAT',
        ('f', 8): 'DOUBLE',
    }

    # Read the CSV file in to a recarray
    ra = mlab.csv2rec(attribute_file)
    col_names = [str(x).upper() for x in ra.dtype.names]
    ra.dtype.names = col_names

    # If there are fields to drop, do that now and get a new recarray
    if drop_fields is not None:
        # Ensure that the drop fields are actually fields in the current
        # recarray
        drop_fields = [x for x in drop_fields if x in ra.dtype.names]

        # Create a new recarray with these fields omitted
        ra = mlab.rec_drop_fields(ra, drop_fields)
        col_names = list(ra.dtype.names)

    # Get the column types and formats
    col_types = [(ra.dtype[i].kind, ra.dtype[i].itemsize)
                 for i in range(len(ra.dtype))]
    formats = [ra.dtype[i].str for i in range(len(ra.dtype))]

    # Sanitize column names
    #   No field name may be longer than 16 chars
    #   No field name can start with a number
    for i in range(len(col_names)):
        if len(col_names[i]) > 16:
            col_names[i] = col_names[i][0:16]
        if col_names[i][0].isdigit():
            col_names[i] = col_names[i].lstrip('0123456789')

    # Reset the names for the recarray
    ra.dtype.names = col_names

    # Sanitize the data
    # Change True/False to 1/0 to be read into short type
    bit_fields = [(i, n) for (i, (n, t))
                  in enumerate(zip(col_names, col_types)) if t[0] == 'b']
    if bit_fields:
        for rec in ra:
            for (col_num, field) in bit_fields:
                value = getattr(rec, field)
                if value:
                    setattr(rec, field, 1)
                else:
                    setattr(rec, field, 0)

        # Change the bit fields to be short integer
        for (col_num, field) in bit_fields:
            formats[col_num] = '<i2'

    # Create a sanitized recarray and output back to CSV
    temp_csv = os.path.join(env.workspace, 'xxtmp.csv')
    ra2 = np.rec.fromrecords(ra, names=col_names, formats=formats)
    mlab.rec2csv(ra2, temp_csv)

    # Create a scratch name for the temporary ArcInfo table
    temp_table = arcpy.CreateScratchName('', '', 'ArcInfoTable')

    # Create the ArcInfo table and add the fields
    table_name = os.path.basename(temp_table)
    arcpy.CreateTable_management(env.workspace, table_name)
    for (n, t) in zip(col_names, col_types):
        try:
            esri_type = numpy_to_esri_type[t]
            arcpy.AddField_management(temp_table, n, esri_type)
        except KeyError:
            if t[0] == 'S':
                arcpy.AddField_management(temp_table, n, 'TEXT', '#', '#',
                                          t[1])
            else:
                err_msg = 'Type not found for ' + str(t)
                print(err_msg)
                continue

    # Append the records from the CSV field to the temporary INFO table
    arcpy.Append_management(temp_csv, temp_table, 'NO_TEST')

    # Strip out the join field from the names if they are the same
    raster_join_field = raster_join_field.upper()
    attribute_join_field = attribute_join_field.upper()
    if raster_join_field == attribute_join_field:
        col_names.remove(attribute_join_field)

    # Create a semi-colon delimited string of the fields we want to join
    field_list = ';'.join(col_names)

    # Clean up
    os.remove(temp_csv)

    return temp_table, field_list
def to_matrix(rec):
    new_rec = mlab.rec_drop_fields(rec, ['Date'])
    new_mat = np.c_[[new_rec[nm] for nm in new_rec.dtype.names]].T
    return new_mat
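# Illustrative call (not from the original source): the record array just
# needs a 'Date' field plus numeric columns.
import numpy as np

rec = np.rec.fromrecords([('2020-01-01', 1.0, 2.0),
                          ('2020-01-02', 3.0, 4.0)],
                         names=['Date', 'open', 'close'])
mat = to_matrix(rec)  # 2x2 float array: rows are records, cols open/close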
def _create_story(self):
    # Set up an empty list to hold the story
    story = []

    # Import the report styles
    styles = report_styles.get_report_styles()

    # Create a page break
    story = self._make_page_break(story, self.LANDSCAPE)

    # This class is somewhat of a hack, in that it likely only works on
    # rotated paragraphs which fit into the desired cell area
    class RotatedParagraph(p.Paragraph):
        def wrap(self, availHeight, availWidth):
            h, w = p.Paragraph.wrap(self, self.canv.stringWidth(self.text),
                                    self.canv._leading)
            return w, h

        def draw(self):
            self.canv.rotate(90)
            self.canv.translate(0.0, -10.0)
            p.Paragraph.draw(self)

    # Section title
    title_str = '<strong>Local-Scale Accuracy Assessment: '
    title_str += 'Error Matrix for Vegetation Classes at Plot '
    title_str += 'Locations</strong>'
    para = p.Paragraph(title_str, styles['section_style'])
    t = p.Table([[para]], colWidths=[10.0 * u.inch])
    t.setStyle(
        p.TableStyle([
            ('TOPPADDING', (0, 0), (-1, -1), 3),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
            ('BACKGROUND', (0, 0), (-1, -1), '#957348'),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 0.25, colors.black),
        ]))
    story.append(t)
    story.append(p.Spacer(0, 0.1 * u.inch))

    # Read in the vegclass error matrix
    names = ['P_' + str(x) for x in range(1, 12)]
    names.insert(0, 'OBSERVED')
    names.extend(['TOTAL', 'CORRECT', 'FUZZY_CORRECT'])
    vc_data = mlab.csv2rec(self.vc_errmatrix_file, skiprows=1, names=names)
    vc_data = mlab.rec_drop_fields(vc_data, ['OBSERVED'])

    # Read in the stand attribute metadata
    mp = xsmp.XMLStandMetadataParser(self.stand_metadata_file)

    # Get the class names from the metadata
    vegclass_metadata = mp.get_attribute('VEGCLASS')
    vc_codes = vegclass_metadata.codes

    # Create a list of lists to hold the vegclass table
    vegclass_table = []

    # Add an empty row which will be a span row for the predicted label
    header_row = []
    for i in range(2):
        header_row.append('')
    prd_str = '<strong>Predicted Class</strong>'
    para = p.Paragraph(prd_str, styles['body_style_10_center'])
    header_row.append(para)
    for i in range(len(vc_data) - 1):
        header_row.append('')
    vegclass_table.append(header_row)

    # Add the predicted labels
    summary_labels = ('Total', '% Correct', '% FCorrect')
    header_row = []
    for i in range(2):
        header_row.append('')
    for code in vc_codes:
        label = re.sub('-', '-<br/>', code.label)
        para = p.Paragraph(label, styles['body_style_10_right'])
        header_row.append(para)
    for label in summary_labels:
        label = re.sub(' ', '<br/>', label)
        para = p.Paragraph(label, styles['body_style_10_right'])
        header_row.append(para)
    vegclass_table.append(header_row)

    # Set a variable to distinguish between plot counts and percents
    # in order to format them differently
    format_break = 11

    # Set the cells which should be blank
    blank_cells = [(11, 12), (11, 13), (12, 11), (12, 13), (13, 11),
                   (13, 12)]

    # Add the data
    for (i, row) in enumerate(vc_data):
        vegclass_row = []
        for (j, elem) in enumerate(row):

            # Blank cells
            if (i, j) in blank_cells:
                elem_str = ''

            # Cells that represent plot counts
            elif i <= format_break and j <= format_break:
                elem_str = '%d' % int(elem)

            # Cells that represent percentages
            else:
                elem_str = '%.1f' % float(elem)
            para = p.Paragraph(elem_str, styles['body_style_10_right'])
            vegclass_row.append(para)

        # Add the observed labels at the beginning of each data row
        if i == 0:
            obs_str = '<strong>Observed Class</strong>'
            para = RotatedParagraph(obs_str, styles['body_style_10_center'])
        else:
            para = ''
        vegclass_row.insert(0, para)

        if i < len(vc_codes):
            label = vc_codes[i].label
        else:
            index = i - len(vc_codes)
            label = summary_labels[index]
        para = p.Paragraph(label, styles['body_style_10_right'])
        vegclass_row.insert(1, para)

        # Add this row to the table
        vegclass_table.append(vegclass_row)

    # Set up the widths for the table cells
    widths = []
    widths.append(0.3)
    widths.append(0.85)
    for i in range(len(vc_codes)):
        widths.append(0.56)
    for i in range(3):
        widths.append(0.66)
    widths = [x * u.inch for x in widths]

    # Convert the vegclass table into a reportlab table
    t = p.Table(vegclass_table, colWidths=widths)
    t.setStyle(
        p.TableStyle([
            ('SPAN', (0, 0), (1, 1)),
            ('SPAN', (0, 2), (0, -1)),
            ('SPAN', (2, 0), (-1, 0)),
            ('BACKGROUND', (0, 0), (-1, -1), '#f1efe4'),
            ('GRID', (0, 0), (-1, -1), 1, colors.white),
            ('ALIGNMENT', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('VALIGN', (0, 2), (0, -1), 'MIDDLE'),
            ('VALIGN', (2, 1), (-1, 1), 'MIDDLE'),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ]))

    # Set up the shading for the truly correct cells
    correct = {}
    for i in range(len(vc_codes)):
        val = i + 1
        correct[val] = val

    for key in correct:
        val = correct[key]
        t.setStyle(
            p.TableStyle([
                ('BACKGROUND', (key + 1, val + 1), (key + 1, val + 1),
                 '#aaaaaa'),
            ]))

    # Set up the shading for the fuzzy correct cells
    fuzzy = {}
    fuzzy[1] = [2]
    fuzzy[2] = [1, 3, 5, 8]
    fuzzy[3] = [2, 4, 5]
    fuzzy[4] = [3, 6, 7]
    fuzzy[5] = [2, 3, 6, 8]
    fuzzy[6] = [4, 5, 7, 9]
    fuzzy[7] = [4, 6, 10, 11]
    fuzzy[8] = [2, 5, 9]
    fuzzy[9] = [6, 8, 10]
    fuzzy[10] = [7, 9, 11]
    fuzzy[11] = [7, 10]

    for key in fuzzy:
        for elem in fuzzy[key]:
            t.setStyle(
                p.TableStyle([
                    ('BACKGROUND', (key + 1, elem + 1),
                     (key + 1, elem + 1), '#dddddd'),
                ]))

    # Add this table to the story
    story.append(t)
    story.append(p.Spacer(0, 0.1 * u.inch))

    # Explanation and definitions of vegetation class categories
    cell_str = """
        Cell values are model plot counts.  Dark gray cells represent
        plots where the observed class matches the predicted class and
        are included in the percent correct.  Light gray cells represent
        cases where the observed and predicted differ slightly (within
        +/- one class) based on canopy cover, hardwood proportion or
        average stand diameter, and are included in the percent fuzzy
        correct.
    """
    para = p.Paragraph(cell_str, styles['body_style_9'])
    story.append(para)
    story.append(p.Spacer(0, 0.1 * u.inch))

    head_str = '''
        <strong>Vegetation Class (VEGCLASS) Definitions</strong> --
        CANCOV (canopy cover of all live trees), BAH_PROP (proportion of
        hardwood basal area), and QMD_DOM (quadratic mean diameter of
        all dominant and codominant trees).
    '''
    para = p.Paragraph(head_str, styles['body_style_9'])
    story.append(para)
    story.append(p.Spacer(0, 0.1 * u.inch))

    # Print out the vegclass code definitions
    for code in vc_codes:
        label = code.label
        desc = self.txt_to_html(code.description)
        doc_str = '<strong>' + label + ':</strong> ' + desc
        para = p.Paragraph(doc_str, styles['body_style_9'])
        story.append(para)

    return story
def fit_classifier(aml_clean_path, class_path, test=False,
                   performance=False, n_fits=100, test_split=0.2,
                   save_clf=True):
    '''Fits random forest classifier to aml_ref_clean formatted csv.

    Note that the species code should be contained in the folder col.'''

    # Get class_path dir, used for ancillary file names
    class_dir, tail = os.path.split(class_path)
    prefix = tail.split('.')[0]

    # Load reference features table
    table = csv2rec(aml_clean_path)

    # Only use calls with qual < 0.3 (Armitage)
    table = table[table.qual < 0.3]

    # Get target col (y) with integer codes instead of spp names
    y_str = table.folder  # Assumes spp name is in folder col
    y_str_uniq = set(list(y_str))
    n_spp = len(y_str_uniq)
    spp_codes = range(0, n_spp)
    code_table = np.array(list(zip(spp_codes, y_str_uniq)),
                          dtype=[('code', '<i8'), ('spp', '|S8')])

    # Get col of full length with codes, not names
    y = np.zeros(len(y_str))
    for code, spp in code_table:
        y[y_str == spp] = int(code)

    # Get filename col for later grouping into passes
    f = table.filename

    # Remove non-feature cols from table
    table = rec_drop_fields(table, ['path', 'folder', 'filename', 'st',
                                    'dc', 'qual', 'pmc'])

    # Get list of feature names remaining in table
    feature_names = table.dtype.names

    # Recarray to ndarray - http://stackoverflow.com/questions/5957380/
    # convert-structured-array-to-regular-numpy-array
    X = table.view((float, len(table.dtype.names)))

    # Partition data if test, holding portion for testing
    if not test:
        X_tr = X
        y_tr = y
        f_tr = f
        X_te = X
        y_te = y
        f_te = f
    else:
        # Use StratifiedShuffleSplit since train_test_split does not
        # stratify
        sss = StratifiedShuffleSplit(y, 1, test_size=test_split)
        for train_index, test_index in sss:  # Only once since n_iter=1
            X_tr, X_te = X[train_index], X[test_index]
            y_tr, y_te = y[train_index], y[test_index]
            f_tr, f_te = f[train_index], f[test_index]

        sort_ind = f_te.argsort()  # Sort test data for pass analysis later
        X_te = X_te[sort_ind, :]  # Sort rows
        y_te = y_te[sort_ind]
        f_te = f_te[sort_ind]
        # (Train data order does not matter)

    # Define and fit classifier
    clf = RandomForestClassifier(n_estimators=n_fits, oob_score=True,
                                 compute_importances=True)
    clf.fit(X_tr, y_tr)

    # If performance, save various performance metrics
    # NOTE: Performance of passes is difficult to understand if test=True,
    # as the calls in one pass may be split up.
    if performance:

        # Get OOB score
        print('OOB Score: ', clf.oob_score_)

        # Predict on test data, which may be held out (test=True) or all
        # data
        y_te_pr = clf.predict(X_te)

        # Get true data and predictions by passes
        pred_te = clf.predict_proba(X_te)  # Prob of each spp
        f_te_p, pred_te_p, other = sum_group(f_te, pred_te, [y_te])
        y_te_p = other[0]  # Actual spp for each pass

        y_te_p_pr = []
        for row in range(len(y_te_p)):  # Find pred species for each pass
            y_te_p_pr.append(pred_te_p[row].argmax())  # First ind, ties bias
        y_te_p_pr = np.array(y_te_p_pr)

        # Get accuracy and confusion matrix for calls
        def make_conf_mat(y_te, y_te_pr, type):
            conf_mat = metrics.confusion_matrix(y_te, y_te_pr)
            conf_mat_frac = conf_mat / np.sum(conf_mat, axis=0)
            print(type, ' Accuracy: ',
                  metrics.zero_one_score(y_te, y_te_pr))
            np.savetxt(os.path.join(class_dir,
                                    prefix + '_conf_' + type + '.csv'),
                       conf_mat, fmt='%i', delimiter=',')
            np.savetxt(os.path.join(class_dir,
                                    prefix + '_conffr_' + type + '.csv'),
                       conf_mat_frac, fmt='%.6f', delimiter=',')

        make_conf_mat(y_te, y_te_pr, 'call')
        make_conf_mat(y_te_p, y_te_p_pr, 'pass')

    # Save spp_code table, feature_names, and pickle classifier
    rec2csv(code_table, os.path.join(class_dir, prefix + '_spp_codes.csv'))
    rec2csv(np.array(list(feature_names), dtype=[('features', 'S8')]),
            os.path.join(class_dir, prefix + '_feature_names.csv'))
    if save_clf:
        joblib.dump(clf, class_path, compress=9)
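# Hedged usage sketch (not from the original source): both paths are
# placeholders, and 'aml_clean.csv' must follow the aml_ref_clean format
# described in the docstring (species code in the folder column).
fit_classifier('aml_clean.csv', 'classifiers/rf_clf.pkl', test=True,
               performance=True, n_fits=200)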
def run(self):
    # Convert the species and environment matrices to numpy rec arrays
    spp_ra = utilities.csv2rec(self.spp_file)
    env_ra = utilities.csv2rec(self.env_file)

    # Extract the plot IDs from both the species and environment matrices
    # and ensure that they are equal
    spp_plot_ids = getattr(spp_ra, self.id_field)
    env_plot_ids = getattr(env_ra, self.id_field)
    if not np.all(spp_plot_ids == env_plot_ids):
        err_msg = "Species and environment plot IDs do not match"
        raise ValueError(err_msg)

    # Drop the ID column from both arrays
    spp_ra = mlab.rec_drop_fields(spp_ra, [self.id_field])
    env_ra = mlab.rec_drop_fields(env_ra, [self.id_field])

    # For the environment matrix, only keep the variables specified
    env_ra = mlab.rec_keep_fields(env_ra, self.variables)

    # Convert these matrices to pure floating point arrays
    spp = np.array([spp_ra[x] for x in spp_ra.dtype.names], dtype=float).T
    env = np.array([env_ra[x] for x in env_ra.dtype.names], dtype=float).T

    # Apply transformation if desired
    if self.species_transform == "SQRT":
        spp = np.sqrt(spp)
    elif self.species_transform == "LOG":
        spp = np.log(spp)

    # Create the RDA object
    cca = numpy_ordination.NumpyRDA(spp, env)

    # Open the output file
    numpy_fh = open(self.ord_file, "w")

    # Eigenvalues
    numpy_fh.write("### Eigenvalues ###\n")
    for (i, e) in enumerate(cca.eigenvalues):
        numpy_fh.write("RDA" + str(i + 1) + "," + "%.10f" % e + "\n")
    numpy_fh.write("\n")

    # Print out variable means
    numpy_fh.write("### Variable Means ###\n")
    for (i, m) in enumerate(cca.env_means):
        numpy_fh.write("%s,%.10f\n" % (self.variables[i], m))
    numpy_fh.write("\n")

    # Print out environmental coefficient loadings
    numpy_fh.write("### Coefficient Loadings ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("VARIABLE," + header_str + "\n")
    for (i, c) in enumerate(cca.coefficients()):
        coeff = ",".join(["%.10f" % x for x in c])
        numpy_fh.write("%s,%s\n" % (self.variables[i], coeff))
    numpy_fh.write("\n")

    # Print out biplot scores
    numpy_fh.write("### Biplot Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("VARIABLE," + header_str + "\n")
    for (i, b) in enumerate(cca.biplot_scores()):
        scores = ",".join(["%.10f" % x for x in b])
        numpy_fh.write("%s,%s\n" % (self.variables[i], scores))
    numpy_fh.write("\n")

    # Print out species centroids
    numpy_fh.write("### Species Centroids ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("SPECIES," + header_str + "\n")
    for (i, c) in enumerate(cca.species_centroids()):
        scores = ",".join(["%.10f" % x for x in c])
        numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
    numpy_fh.write("\n")

    # Print out species tolerances
    numpy_fh.write("### Species Tolerances ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("SPECIES," + header_str + "\n")
    for (i, t) in enumerate(cca.species_tolerances()):
        scores = ",".join(["%.21f" % x for x in t])
        numpy_fh.write("%s,%s\n" % (spp_ra.dtype.names[i], scores))
    numpy_fh.write("\n")

    # Print out miscellaneous species information
    numpy_fh.write("### Miscellaneous Species Information ###\n")
    numpy_fh.write("SPECIES,WEIGHT,N2\n")
    species_weights, species_n2 = cca.species_information()
    for i in range(len(species_weights)):
        numpy_fh.write("%s,%.10f,%.10f\n" % (spp_ra.dtype.names[i],
                                             species_weights[i],
                                             species_n2[i]))
    numpy_fh.write("\n")

    # Print out site LC scores
    numpy_fh.write("### Site LC Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("ID," + header_str + "\n")
    for (i, s) in enumerate(cca.site_lc_scores()):
        scores = ",".join(["%.10f" % x for x in s])
        numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
    numpy_fh.write("\n")

    # Print out site WA scores
    numpy_fh.write("### Site WA Scores ###\n")
    header_str = ",".join(["RDA%d" % (i + 1) for i in range(cca.rank)])
    numpy_fh.write("ID," + header_str + "\n")
    for (i, s) in enumerate(cca.site_wa_scores()):
        scores = ",".join(["%.10f" % x for x in s])
        numpy_fh.write("%d,%s\n" % (spp_plot_ids[i], scores))
    numpy_fh.write("\n")

    # Miscellaneous site information
    numpy_fh.write("### Miscellaneous Site Information ###\n")
    numpy_fh.write("ID,WEIGHT,N2\n")
    site_weights, site_n2 = cca.site_information()
    for i in range(len(site_weights)):
        numpy_fh.write("%s,%.10f,%.10f\n" % (spp_plot_ids[i],
                                             site_weights[i], site_n2[i]))

    # Close the file
    numpy_fh.close()
def _postprocess(self, output_file, gu_poly, generator_list,
                 overwrite=False, supplementary_figures=False, **kwargs):
    generatoroutputs = []
    for g in generator_list:
        if supplementary_figures:
            gkw = g.G_kwargs
        else:
            gkw = {}
        generatoroutputs.append(g.gen(*g.G_args, overwrite=overwrite,
                                      **gkw))

    gu_arr = gen_gu.gen(*gen_gu.G_args, overwrite=overwrite,
                        **gen_gu.G_kwargs)
    print(gu_arr)

    print("merging arrays")
    out_arr = gen_merge.join_recs_on_keys(
        gu_arr, generatoroutputs,
        (BASIN_ID_FIELD, ADMIN_ID_FIELD, GW_ID_FIELD))

    sr = ap.SpatialReference(PRJNAME)
    ap.Project_management(gu_poly, output_file, sr)
    print(out_arr[BASIN_NAME_FIELD])

    # Pad any expected-but-missing fields with NaN columns
    missing_fields = np.setdiff1d(ALL_FIELDS, out_arr.dtype.names)
    if len(missing_fields) > 0:
        print("WARNING: missing fields %s" % missing_fields)
        obs = len(out_arr[GU_FIELD])
        out_arr = mlab.rec_append_fields(
            out_arr, missing_fields,
            [np.repeat(np.nan, obs) for _ in missing_fields])

    extra_fields = np.setdiff1d(out_arr.dtype.names, ALL_FIELDS)
    print("dropping extra fields %s" % extra_fields)
    out_arr = mlab.rec_drop_fields(out_arr, extra_fields)

    print("generating pre-weighted columns")
    if WEIGHTING_SCHEMES is not None:
        new_cols = []
        names = []
        for n, weights in WEIGHTING_SCHEMES.items():
            keys = list(weights.keys())
            values = list(weights.values())
            indicator_array = np.vstack([out_arr[f] for f in keys]).T
            indicator_array[indicator_array == NULL_VALUE] = np.nan
            scores = np.squeeze(np.asarray(
                aggregate_scores.aggregate_scores(indicator_array, values)))
            scores[np.isnan(scores)] = NULL_VALUE
            new_cols.append(scores)
            names.append(n)
        out_arr = mlab.rec_append_fields(out_arr, names, new_cols)

    for field in MAP_FIELDS:
        out_arr[field][out_arr[field] == ""] = "No data"

    mlab.rec2csv(out_arr, "bin/test.csv")

    print("dropping fields")
    drop = [f.baseName for f in ap.ListFields(output_file)
            if not f.required and f.baseName != GU_FIELD]
    if len(drop) > 0:
        ap.DeleteField_management(output_file, drop)

    print("joining")
    ap.da.ExtendTable(output_file, GU_FIELD, out_arr, GU_FIELD)

    print("indexing")
    try:
        ap.AddSpatialIndex_management(output_file)
        ap.AddIndex_management(output_file, GU_FIELD, GU_FIELD, "UNIQUE")
    except Exception as e:
        print(e)
def fit_sections(im, psf, nx, ny, overlap=50, weight=None, dq=None,
                 blist=None, **kw):
    bdx = numpy.round(numpy.linspace(0, im.shape[0], nx + 1)).astype('i4')
    bdlx = numpy.clip(bdx - overlap, 0, im.shape[0])
    bdrx = numpy.clip(bdx + overlap, 0, im.shape[0])
    bdy = numpy.round(numpy.linspace(0, im.shape[1], ny + 1)).astype('i4')
    bdly = numpy.clip(bdy - overlap, 0, im.shape[1])
    bdry = numpy.clip(bdy + overlap, 0, im.shape[1])

    modelim = numpy.zeros_like(im)
    skyim = numpy.zeros_like(im)
    prisofar = numpy.zeros_like(im, dtype='bool')

    # this holder for stars gets filled out more completely later after
    # the first fit; for the moment, we just want the critical fields to
    # exist
    stars = numpy.zeros(0, dtype=[('x', 'f4'), ('y', 'f4'), ('flux', 'f4'),
                                  ('primary', 'i4'), ('psf', 'i4')])
    t0 = time.time()
    if kw.get('verbose', False):
        print('Starting new CCD at %s' % time.ctime())
        sys.stdout.flush()
    psfs = []
    for i in range(nx):
        for j in range(ny):
            sall = numpy.s_[bdlx[i]:bdrx[i + 1], bdly[j]:bdry[j + 1]]
            spri = numpy.s_[bdx[i]:bdx[i + 1], bdy[j]:bdy[j + 1]]
            dx, dy = (bdrx[i + 1] - bdlx[i], bdry[j + 1] - bdly[j])
            sfit = numpy.s_[bdx[i] - bdlx[i]:dx + bdx[i + 1] - bdrx[i + 1],
                            bdy[j] - bdly[j]:dy + bdy[j + 1] - bdry[j + 1]]
            mfixed = in_bounds(stars['x'], stars['y'],
                               [bdlx[i] - 0.5, bdrx[i + 1] - 0.5],
                               [bdly[j] - 0.5, bdry[j + 1] - 0.5])
            ol2 = overlap / 2
            mfixed &= ~in_bounds(stars['x'], stars['y'],
                                 [bdx[i] - 0.5 - ol2,
                                  bdx[i + 1] - 0.5 + ol2],
                                 [bdy[j] - 0.5 - ol2,
                                  bdy[j + 1] - 0.5 + ol2])
            xp, yp = (numpy.round(c).astype('i4')
                      for c in (stars['x'], stars['y']))
            mfixed &= (stars['primary'] == 1) | (prisofar[xp, yp] == 0)
            fixedstars = {f: stars[f][mfixed] for f in stars.dtype.names}
            fixedstars['x'] -= bdlx[i]
            fixedstars['y'] -= bdly[j]
            fixedstars['psfob'] = psfs
            fixedstars['offset'] = (bdlx[i], bdly[j])
            if (i == 0) and (j == 0):
                tpsf = psf
            elif j != 0:
                tpsf = psfs[-1]
            else:
                tpsf = psfs[-ny]
            if blist is not None:
                # cut to bright stars in subimage
                mb = ((blist[0] >= bdlx[i]) & (blist[0] <= bdrx[i + 1]) &
                      (blist[1] >= bdly[j]) & (blist[1] <= bdry[j + 1]))
                # offset X & Y to new positions
                blist0 = [blist[0][mb] - bdlx[i], blist[1][mb] - bdly[j],
                          blist[2][mb]]
            else:
                blist0 = None
            res0 = crowdsource_base.fit_im(im[sall].copy(), tpsf,
                                           weight=weight[sall].copy(),
                                           dq=dq[sall].copy(),
                                           fixedstars=fixedstars,
                                           blist=blist0, **kw)
            newstars, skypar0, model0, sky0, psf0 = res0
            newstars['x'] += bdlx[i]
            newstars['y'] += bdly[j]
            primary0 = in_bounds(newstars['x'], newstars['y'],
                                 [bdx[i] - 0.5, bdx[i + 1] - 0.5],
                                 [bdy[j] - 0.5, bdy[j + 1] - 0.5])
            newstars['primary'] = primary0
            newstars['psf'] = (numpy.ones(len(newstars['x']), dtype='i4') *
                               len(psfs))
            dtypenames = list(newstars.keys())
            dtypeformats = [newstars[n].dtype for n in dtypenames]
            dtype = dict(names=dtypenames, formats=dtypeformats)
            newstars = numpy.fromiter(zip(*newstars.values()), dtype=dtype,
                                      count=len(newstars['x']))
            stars = (newstars if len(stars) == 0
                     else numpy.append(stars, newstars))
            psf0.offset = (bdlx[i], bdly[j])
            psfs.append(psf0)
            modelim[spri] = model0[sfit]
            skyim[spri] = sky0[sfit]
            prisofar[spri] = sky0[sfit]
            if kw.get('verbose', False):
                t1 = time.time()
                print('Fit tile (%d, %d) of (%d, %d); %d sec elapsed' %
                      (i + 1, j + 1, nx, ny, t1 - t0))
                t0 = t1
                sys.stdout.flush()

    stars = stars[stars['primary'] == 1]
    from matplotlib.mlab import rec_drop_fields
    stars = rec_drop_fields(stars, ['primary'])
    return stars, modelim, skyim, psfs
def classify_calls(aml_clean_path, class_path, param_dict):
    '''Classify calls by species and save files.'''

    # Get dirs
    output_dir, tail = os.path.split(aml_clean_path)
    class_dir, tail = os.path.split(class_path)
    prefix = tail.split('.')[0]

    # Load spp_names as list
    spp_path = os.path.join(class_dir, prefix + '_spp_codes.csv')
    spp_names = list(csv2rec(spp_path).spp)
    spp_names_comma = ''.join([x + ',' for x in spp_names])[:-1]

    # Load classifier from pickle
    clf = joblib.load(class_path)

    # Load aml_clean as recarray
    table = csv2rec(aml_clean_path)

    # Only use calls with qual < maxqual
    table = table[table.qual < float(param_dict['maxqual'])]

    # Save path, folder, call, and qual fields for later
    path = table.path
    folder = table.folder
    call = table.filename
    qual = table.qual

    # Remove non-feature cols from table
    table = rec_drop_fields(table, ['path', 'folder', 'filename', 'st',
                                    'dc', 'qual', 'pmc'])

    # Recarray to ndarray, since classifier requires ndarray
    X = table.view((float, len(table.dtype.names)))

    # Predict probabilities for each call
    pred = clf.predict_proba(X)

    # Save call_prob and call_bin files
    header = 'path,folder,pass,qual,' + spp_names_comma
    file_callpr = open(os.path.join(output_dir, 'call_prob.csv'), 'w')
    file_callbi = open(os.path.join(output_dir, 'call_bin.csv'), 'w')
    file_callpr.write(header + '\n')
    file_callbi.write(header + '\n')

    for row in range(0, len(call)):  # For all calls
        row_comma_prob = ''.join([str(x) + ',' for x in pred[row]])[:-1]
        row_bin = (pred[row] == pred[row].max()) + 0  # +0 makes int not bool
        row_comma_bin = ''.join([str(x) + ',' for x in row_bin])[:-1]
        file_callpr.write(path[row] + ',' + folder[row] + ',' + call[row] +
                          ',' + str(qual[row]) + ',' + row_comma_prob +
                          '\n')
        file_callbi.write(path[row] + ',' + folder[row] + ',' + call[row] +
                          ',' + str(qual[row]) + ',' + row_comma_bin + '\n')

    file_callpr.close()
    file_callbi.close()

    # Get array of unique filenames (i.e., passes; each may contain many
    # calls)
    passes = np.unique(call)

    # Set up pass files for writing
    header = 'path,folder,pass,ncalls,' + spp_names_comma
    file_passpr = open(os.path.join(output_dir, 'pass_prob.csv'), 'w')
    file_passmaxpr = open(os.path.join(output_dir, 'pass_maxprob.csv'), 'w')
    file_passbi = open(os.path.join(output_dir, 'pass_bin.csv'), 'w')
    file_passpr.write(header + '\n')
    file_passmaxpr.write(header + '\n')
    file_passbi.write(header + '\n')

    # Loop through passes
    for this_pass in passes:

        # Get boolean locations in table of calls associated with this pass
        these_calls = (call == this_pass)
        first_call = np.argmax(these_calls)  # Row of first call

        # Take subset of pred corresponding to calls in pass
        these_preds = pred[these_calls]

        # Get descriptor variables for this pass
        this_path = path[first_call]
        this_folder = folder[first_call]
        this_ncalls = np.shape(these_preds)[0]

        # Get mean probability for each species
        if these_preds.shape[0] == 1:  # If only one call
            pass_prob = these_preds / this_ncalls
            pass_prob = pass_prob.flatten()
        else:
            pass_prob = np.sum(these_preds, 0) / this_ncalls

        # Find the species with the maximum prob
        pass_maxprob = (pass_prob == pass_prob.max()) + 0

        # Find all calls with prob greater than minprob, cast to int
        minprob_calls = (these_preds > float(param_dict['minprob'])) + 0

        # Count number of calls for each species that meet minprob
        num_minprob_calls = np.sum(minprob_calls, 0)

        # Find all species with sufficient calls to meet mincalls
        pass_bin = (num_minprob_calls >= float(param_dict['mincalls'])) + 0

        # Write files
        row_comma_prob = ''.join([str(x) + ',' for x in pass_prob])[:-1]
        file_passpr.write(this_path + ',' + this_folder + ',' + this_pass +
                          ',' + str(this_ncalls) + ',' + row_comma_prob +
                          '\n')
        row_comma_maxprob = ''.join([str(x) + ','
                                     for x in pass_maxprob])[:-1]
        file_passmaxpr.write(this_path + ',' + this_folder + ',' +
                             this_pass + ',' + str(this_ncalls) + ',' +
                             row_comma_maxprob + '\n')
        row_comma_bin = ''.join([str(x) + ',' for x in pass_bin])[:-1]
        file_passbi.write(this_path + ',' + this_folder + ',' + this_pass +
                          ',' + str(this_ncalls) + ',' + row_comma_bin +
                          '\n')

    # Close pass files
    file_passpr.close()
    file_passmaxpr.close()
    file_passbi.close()
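# Hedged usage sketch (not from the original source): paths are
# placeholders; the param_dict keys (maxqual, minprob, mincalls) are the
# ones classify_calls reads.
params = {'maxqual': '0.3', 'minprob': '0.5', 'mincalls': '2'}
classify_calls('aml_clean.csv', 'classifiers/rf_clf.pkl', params)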