def rec2csv(rec_array, csv_file, formatd=None, **kwargs):
    """
    Convenience wrapper function on top of mlab.rec2csv to allow
    fixed-precision output to CSV files

    Parameters
    ----------
    rec_array : numpy 1-d recarray
        The recarray to be written out
    csv_file : str
        CSV file name
    formatd : dict, optional
        Mapping of field name to mlab format object
    kwargs : dict
        Keyword arguments to pass through to mlab.rec2csv

    Returns
    -------
    None
    """
    # Get the formatd objects associated with each field
    formatd = mlab.get_formatd(rec_array, formatd)

    # For all FormatFloat objects, switch to FormatDecimal objects
    for (k, v) in formatd.iteritems():
        if isinstance(v, mlab.FormatFloat):
            formatd[k] = FormatDecimal()

    # Pass this specification to mlab.rec2csv
    mlab.rec2csv(rec_array, csv_file, formatd=formatd, **kwargs)
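A minimal usage sketch of the wrapper above, assuming it lives in a module where `mlab` is imported and a `FormatDecimal` class (not shown here) is defined; the recarray and its field names are hypothetical:

import numpy as np

# hypothetical two-field recarray; 'value' would normally be written via
# mlab.FormatFloat, which the wrapper swaps for fixed-precision FormatDecimal
ra = np.rec.fromrecords([(1, 0.123456789), (2, 9.87654321)], names='id,value')
rec2csv(ra, 'values.csv')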
def otherfunc(roifiles, subjects):
    import numpy as np
    from matplotlib.mlab import rec2csv
    import os
    first = np.recfromcsv(roifiles[0])
    numcons = len(first.dtype.names) - 1
    roinames = ["subject_id"] + first["roi"].tolist()
    formats = ['a20'] + ['f4' for f in roinames[1:]]
    confiles = []
    for con in range(0, numcons):
        recarray = np.zeros(len(roifiles),
                            dtype={'names': roinames, "formats": formats})
        for i, file in enumerate(roifiles):
            recfile = np.recfromcsv(file)
            recarray["subject_id"][i] = subjects[i]
            for roi in roinames[1:]:
                value = recfile["con%02d" % (con + 1)][recfile['roi'] == roi]
                if value:
                    recarray[roi][i] = value
                else:
                    recarray[roi][i] = 999
        filename = os.path.abspath("grouped_con%02d.csv" % (con + 1))
        rec2csv(recarray, filename)
        confiles.append(filename)
    return confiles
def make_csv(self, out_csv, array):
    if out_csv is None:
        return 0
    else:
        print "Generating csv"
        mlab.rec2csv(array, out_csv)
        return 1
def testR(d=simple(), size=500):
    X = random_from_categorical_formula(d, size)
    X = ML.rec_append_fields(X, 'response', np.random.standard_normal(size))

    fname = tempfile.mktemp()
    ML.rec2csv(X, fname)

    Rstr = '''
    data = read.table("%s", sep=',', header=T)
    cur.lm = lm(response ~ %s, data)
    COEF = coef(cur.lm)
    ''' % (fname, d.Rstr)

    rpy2.robjects.r(Rstr)
    remove(fname)
    nR = list(np.array(rpy2.robjects.r("names(COEF)")))

    nt.assert_true('(Intercept)' in nR)
    nR.remove("(Intercept)")

    nF = [str(t).replace("_", "").replace("*", ":") for t in d.formula.terms]
    nR = sorted([sorted(n.split(":")) for n in nR])

    nt.assert_true('1' in nF)
    nF.remove('1')
    nF = sorted([sorted(n.split(":")) for n in nF])
    nt.assert_equal(nR, nF)

    return d, X, nR, nF
def test_csv2rec_closefile(self):
    # If passed a file-like object, rec2csv should not close it.
    ra = numpy.rec.array([(123, 1197346475.0137341),
                          (456, 123.456)],
                         dtype=[('a', '<i8'), ('b', '<f8')])
    fh = StringIO.StringIO()
    mlab.rec2csv(ra, fh)
    self.failIf(fh.closed)
def makediffs(models=_allmodels, verbose=False, kpp=True):
    for model in models:
        model = os.path.splitext(os.path.basename(model))[0]
        if kpp:
            kppdat = csv2rec(os.path.join(model, model + '.dat'),
                             delimiter=' ')
        else:
            if model not in _modelconfigs:
                raise IOError('If KPP is not properly installed, you cannot '
                              'run tests on mechanisms other than cbm4, '
                              'saprc99, and small_strato.')
            kppdat = csv2rec(os.path.join(os.path.dirname(__file__),
                                          model + '.dat'), delimiter=' ')
        pykppdat = csv2rec(os.path.join(model, model + '.pykpp.dat'),
                           delimiter=',')
        diff = pykppdat.copy()
        pct = pykppdat.copy()
        keys = set(kppdat.dtype.names).intersection(pykppdat.dtype.names)
        notkeys = set(pykppdat.dtype.names).difference(kppdat.dtype.names)
        notkeys.remove('t')
        for k in notkeys:
            diff[k] = np.nan
            pct[k] = np.nan
        for k in keys:
            diff[k] = pykppdat[k] - kppdat[k][:]
            pct[k] = diff[k] / kppdat[k][:] * 100
        diff['t'] = pykppdat['t'] - (kppdat['time'] * 3600. + pykppdat['t'][0])
        pct['t'] = diff['t'] / (kppdat['time'] * 3600. + pykppdat['t'][0]) * 100
        rec2csv(diff, os.path.join(model, model + '.diff.csv'), delimiter=',')
        rec2csv(pct, os.path.join(model, model + '.pct.csv'), delimiter=',')
def main():
    print "initializing"
    ap.env.overwriteOutput = True
    ap.env.workspace = WORKSPACE
    ras = ["marginal_ag_land_ha", "favored_ag_land_ha",
           "ag_wateronly_constrained_ha", "ag_landonly_constrained_ha",
           "ag_both_constrained_ha"]
    lbls = ["mar_ha", "fav_ha", "water_ha", "land_ha", "both_ha"]
    ap.CheckOutExtension("SPATIAL")
    POLYS = "mena_plus"
    POLYFIELD = "name"
    recs = []
    for i in range(len(ras)):
        ap.sa.ZonalStatisticsAsTable(POLYS, POLYFIELD, ras[i], lbls[i],
                                     "DATA", "SUM")
        recs.append(ap.da.TableToNumPyArray(lbls[i], [POLYFIELD, "SUM"]))
    outrecs = [recs[i]["SUM"] for i in range(len(recs))]
    outrecs.extend([recs[i][POLYFIELD] for i in range(len(recs))])
    mlab.rec2csv(np.rec.fromarrays(outrecs, names=lbls), OUTCSV)
    print "complete"
def rewrite_spec(subj, run, root="/home/jtaylo/FIAC-HBM2009"):
    """
    Take a FIAC specification file and get two specifications
    (experiment, begin).

    This creates two new .csv files, one for the experimental
    conditions, the other for the "initial" confounding trials that
    are to be modelled out.

    For the block design, the "initial" trials are the first trials
    of each block. For the event designs, the "initial" trials are
    made up of just the first trial.
    """
    if exists(pjoin("%(root)s", "fiac%(subj)d",
                    "subj%(subj)d_evt_fonc%(run)d.txt")
              % {'root': root, 'subj': subj, 'run': run}):
        designtype = 'evt'
    else:
        designtype = 'bloc'

    # Fix the format of the specification so it is
    # more in the form of a 2-way ANOVA
    eventdict = {1: 'SSt_SSp', 2: 'SSt_DSp', 3: 'DSt_SSp', 4: 'DSt_DSp'}
    s = StringIO()
    w = csv.writer(s)
    w.writerow(['time', 'sentence', 'speaker'])

    specfile = pjoin("%(root)s", "fiac%(subj)d",
                     "subj%(subj)d_%(design)s_fonc%(run)d.txt") % \
        {'root': root, 'subj': subj, 'run': run, 'design': designtype}
    d = np.loadtxt(specfile)
    for row in d:
        w.writerow([row[0]] + eventdict[row[1]].split('_'))
    s.seek(0)
    d = csv2rec(s)

    # Now, take care of the 'begin' event
    # This is due to the FIAC design
    if designtype == 'evt':
        b = np.array([(d[0]['time'], 1)],
                     np.dtype([('time', np.float), ('initial', np.int)]))
        d = d[1:]
    else:
        k = np.equal(np.arange(d.shape[0]) % 6, 0)
        b = np.array([(tt, 1) for tt in d[k]['time']],
                     np.dtype([('time', np.float), ('initial', np.int)]))
        d = d[~k]

    designtype = {'bloc': 'block', 'evt': 'event'}[designtype]

    fname = pjoin(DATADIR, "fiac_%(subj)02d", "%(design)s",
                  "experiment_%(run)02d.csv") % \
        {'root': root, 'subj': subj, 'run': run, 'design': designtype}
    rec2csv(d, fname)
    experiment = csv2rec(fname)

    fname = pjoin(DATADIR, "fiac_%(subj)02d", "%(design)s",
                  "initial_%(run)02d.csv") % \
        {'root': root, 'subj': subj, 'run': run, 'design': designtype}
    rec2csv(b, fname)
    initial = csv2rec(fname)

    return d, b
def to_file(self, filename, **kwargs):
    """
    Saves results to file, which will be gzipped if `filename` has
    a .gz extension.

    kwargs are passed to matplotlib.mlab.rec2csv
    """
    rec2csv(self.data, filename, **kwargs)
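A brief usage sketch, assuming `results` is an instance of the class that defines `to_file`; the gzip behaviour comes from matplotlib's file-handling helper recognising the `.gz` suffix, and any extra kwargs (e.g. `delimiter`) are forwarded unchanged to mlab.rec2csv:

results.to_file('run1.csv')                  # plain CSV
results.to_file('run1.csv.gz')               # gzip-compressed CSV
results.to_file('run1.txt', delimiter='\t')  # kwargs passed to rec2csv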
def convert(infilename, outfilename):
    results = tables.open_file(infilename, mode='r')
    ra = results.root.textlog[:]
    results.close()
    mlab.rec2csv(ra, outfilename)
def test_rec2csv_bad_shape():
    fd = tempfile.TemporaryFile(suffix='csv')
    try:
        bad = np.recarray((99, 4), [('x', np.float), ('y', np.float)])
        # the bad recarray should trigger a ValueError for having ndim > 1.
        np.testing.assert_raises(ValueError, mlab.rec2csv, bad, fd)
    finally:
        fd.close()
def write_results_to_csv(results, directory):
    experiments, outcomes = results
    # deceased_pop = outcomes['relative market price']
    # time = outcomes[TIME]

    rec2csv(experiments, directory + '/experiments.csv', withheader=True)
    for key, value in outcomes.iteritems():
        np.savetxt(directory + '/{}.csv'.format(key), value, delimiter=',')
def interesting_out(opts, interesting, data):
    """
    Take a list of fields and the recs, and output the kept fields of
    each rec as csv to opts["out"] (e.g. --out)
    """
    header = True
    from matplotlib import mlab
    for d in data:
        cleaned = mlab.rec_keep_fields(d, interesting)
        mlab.rec2csv(cleaned, opts["out"], withheader=header)
        header = False
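A hypothetical call of the function above; this assumes opts['out'] is an already-open writable file object, so the successive rec2csv calls append to one stream (rec2csv leaves file-like objects open, as the test_csv2rec_closefile snippet above demonstrates) and only the first chunk is written with a header:

with open('interesting.csv', 'w') as out:
    # recs: a list of recarrays that all contain 'date' and 'age' fields
    interesting_out({'out': out}, ['date', 'age'], recs)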
def main():
    inputlist = ["bin/global_BWS_20121015.csv", "bin/global_WRI_20121015.csv"]
    lhs = mlab.csv2rec("bin/global_GU_20121015.csv")
    rhslist = []
    for x in inputlist:
        rhslist.append(mlab.csv2rec(x))
    rhslist[0]["basinid"] = rhslist[0]["basinid"].astype(np.long)
    keys = ("basinid", "countryid", "id")
    lhs = join_recs_on_keys(lhs, rhslist, keys)
    mlab.rec2csv(lhs, "bin/test.csv")
    print "complete"
def main():
    print "initializing"
    ap.env.overwriteOutput = True
    # "World_Cylindrical_Equal_Area"
    sr = ap.SpatialReference(54034)
    ap.Project_management(BASINPOLY, TMP_OUT, sr)
    ap.CalculateAreas_stats(TMP_OUT, TMP_OUT2)
    out = ap.da.FeatureClassToNumPyArray(TMP_OUT2, [BASIN_ID_FIELD, "F_AREA"])
    mlab.rec2csv(out, AREACSV)
    print "complete"
def test_recarray_csv_roundtrip():
    expected = np.recarray((99, ),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    expected['x'][0] = 1
    expected['y'][1] = 2
    expected['t'][2] = 3
    fd = tempfile.TemporaryFile(suffix='csv')
    mlab.rec2csv(expected, fd)
    fd.seek(0)
    actual = mlab.csv2rec(fd)
    fd.close()
    assert np.allclose(expected['x'], actual['x'])
    assert np.allclose(expected['y'], actual['y'])
    assert np.allclose(expected['t'], actual['t'])
def test_recarray_csv_roundtrip():
    expected = np.recarray((99, ),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    expected['x'][:] = np.linspace(-1e9, -1, 99)
    expected['y'][:] = np.linspace(1, 1e9, 99)
    expected['t'][:] = np.linspace(0, 0.01, 99)
    fd = tempfile.TemporaryFile(suffix='csv')
    mlab.rec2csv(expected, fd)
    fd.seek(0)
    actual = mlab.csv2rec(fd)
    fd.close()
    assert np.allclose(expected['x'], actual['x'])
    assert np.allclose(expected['y'], actual['y'])
    assert np.allclose(expected['t'], actual['t'])
def __call__(self, *args, **kwargs):
    """Load requested dataset, downloading it if needed or requested.

    For test purposes, instead of actually fetching the dataset, this
    function creates empty files and returns their paths.
    """
    kwargs['mock'] = True
    files = original_fetch_files(*args, **kwargs)
    # Fill CSV files with given content if needed
    for f in files:
        basename = os.path.basename(f)
        if basename in self.csv_files:
            array = self.csv_files[basename]
            rec2csv(array, f)
    return files
def main(basin_csv, basin_poly, storage_pts, stor_csv):
    basin_rec = mlab.csv2rec(basin_csv)
    ids = basin_rec["basinid"]
    d_ids = basin_rec["dwnbasinid"]
    ap.Identity_analysis(storage_pts, basin_poly, TMP_OUT, "NO_FID")
    out = ap.da.FeatureClassToNumPyArray(TMP_OUT, [STOR_FIELD, BASIN_ID_FIELD])
    stor = np.array([np.sum(out[STOR_FIELD][out[BASIN_ID_FIELD] == i])
                     for i in ids])
    fa_stor = fa.accumulate(ids, d_ids, f0, f, stor)
    outrec = np.rec.fromarrays((ids, stor, fa_stor),
                               names=("basinid", "stor", "fa_stor"))
    mlab.rec2csv(outrec, stor_csv)
def test_recarray_csv_roundtrip(self):
    expected = np.recarray((99, ),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    # initialising all values: uninitialised memory sometimes produces
    # floats that do not round-trip to string and back.
    expected['x'][:] = np.linspace(-1e9, -1, 99)
    expected['y'][:] = np.linspace(1, 1e9, 99)
    expected['t'][:] = np.linspace(0, 0.01, 99)
    mlab.rec2csv(expected, self.fd)
    self.fd.seek(0)
    actual = mlab.csv2rec(self.fd)
    np.testing.assert_allclose(expected['x'], actual['x'])
    np.testing.assert_allclose(expected['y'], actual['y'])
    np.testing.assert_allclose(expected['t'], actual['t'])
def test_recarray_csv_roundtrip():
    expected = np.recarray((99,),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    # initialising all values: uninitialised memory sometimes produces floats
    # that do not round-trip to string and back.
    expected['x'] = np.linspace(0, 1e-200, 99)
    expected['y'] = np.linspace(0, 1, 99)
    expected['t'] = np.linspace(0, 1e300, 99)
    fd = tempfile.TemporaryFile(suffix='csv', mode="w+")
    mlab.rec2csv(expected, fd)
    fd.seek(0)
    actual = mlab.csv2rec(fd)
    fd.close()
    assert np.allclose(expected['x'], actual['x'])
    assert np.allclose(expected['y'], actual['y'])
    assert np.allclose(expected['t'], actual['t'])
def test_recarray_csv_roundtrip():
    expected = np.recarray((99, ),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    # initialising all values: uninitialised memory sometimes produces floats
    # that do not round-trip to string and back.
    expected['x'][:] = np.linspace(-1e9, -1, 99)
    expected['y'][:] = np.linspace(1, 1e9, 99)
    expected['t'][:] = np.linspace(0, 0.01, 99)
    fd = tempfile.TemporaryFile(suffix='csv', mode="w+")
    mlab.rec2csv(expected, fd)
    fd.seek(0)
    actual = mlab.csv2rec(fd)
    fd.close()
    assert np.allclose(expected['x'], actual['x'])
    assert np.allclose(expected['y'], actual['y'])
    assert np.allclose(expected['t'], actual['t'])
def write_results_to_csv(results, directory):
    experiments, outcomes = results
    # deceased_pop = outcomes['relative market price']
    # time = outcomes[TIME]

    rec2csv(experiments, directory + '/x.csv', withheader=True)
    for key, value in outcomes.iteritems():
        np.savetxt(directory + '/{}.csv'.format(key), value, delimiter=',')
    # np.savetxt('./data/scarcity/relative_market_price.csv', deceased_pop, delimiter=',')
    # np.savetxt('./data/scarcity/time.csv', time, delimiter=',')
    # for entry in x.dtype.descr: print entry
def getrange(date, num):
    daily_arrays = []
    for day in range(num):
        string_date = date.strftime('%Y%m%d')
        daily_arrays.append(fetchday(string_date))
        date += datetime.timedelta(1)
    full_range = concatenate(daily_arrays, axis=1)
    filename = string_date + '+' + str(num) + '.csv'
    mlab.rec2csv(full_range, filename)
    print 'saved as ', filename
    return full_range
def test(): """Test script""" import matplotlib.mlab as mlab import time import gen_merge BASINCSV = r"C:\Users\francis.gassert\Documents\ArcGIS\GISSync\global_maps\basins_15006.csv" BASINID = "basinid" DWNBASIN = "dwnbasinid" OUTCSV = r"C:\Users\francis.gassert\Documents\ArcGIS\GISSync\global_maps\bt_test.csv" runoffcsv = r"C:\Users\francis.gassert\Documents\ArcGIS\GISSync\global_maps\global-GLDAS-2.0_Noah-3.3_M.020-20121211-filled-20130821-RO.csv" basin_arr = mlab.csv2rec(BASINCSV) ids = basin_arr[BASINID] d_ids = basin_arr[DWNBASIN] r_arr = mlab.csv2rec(runoffcsv) r = r_arr["2010"] assert np.all(r_arr[BASINID]==ids) def f0( i, r ): return r[i] def f( i, idx, values, *args ): return np.sum(values[idx]) + f0(i, *args) time.clock() #id_dict = dict(zip(ids, upstream_ids(ids, d_ids))) #r2 = gen_merge.arrange_vector_by_ids(r, ids, np.arange(len(ids)+1)) #out1 = np.array([np.sum(r2[id_dict[i]])+r2[i] for i in ids]) #t1 = time.clock() out2 = accumulate(ids, d_ids, f0, f, r) t2 = time.clock() btcsv = r"C:\Users\francis.gassert\Documents\ArcGIS\GISSync\global_maps\global-GLDAS-2.0_Noah-3.3_M.020-20121211-filled-20130821-Bt.csv" bt_arr = mlab.csv2rec(btcsv) bt = bt_arr["2010"] #print ("time1: %s" % t1) print ("time2: %s" % t2) #print ("error1: %s " % (np.sum(out1-bt)/np.sum(bt)) ) print ("error2: %s " % (np.sum(out2-bt)/np.sum(bt)) ) outrec2 = np.rec.fromarrays((ids,out2),names=(BASINID,"2010")) mlab.rec2csv(outrec2,OUTCSV)
def test_csv2rec_roundtrip(self):
    # Make sure double-precision floats and strings pass through a
    # roundtrip unaltered.

    # A bug in numpy (fixed in r4602) meant that numpy scalars
    # lost precision when passing through repr(). csv2rec was
    # affected by this. This test will only pass on numpy >=
    # 1.0.5.
    delta = datetime.timedelta(days=1)
    date0 = datetime.date(2007, 12, 16)
    date1 = date0 + delta
    date2 = date1 + delta

    delta = datetime.timedelta(days=1)
    datetime0 = datetime.datetime(2007, 12, 16, 22, 29, 34, 924122)
    datetime1 = datetime0 + delta
    datetime2 = datetime1 + delta

    ra = numpy.rec.fromrecords(
        [
            (123, date0, datetime0, 1197346475.0137341, 'a,bc'),
            (456, date1, datetime1, 123.456, 'd\'ef'),
            (789, date2, datetime2, 0.000000001, 'ghi'),
        ],
        names='intdata,datedata,datetimedata,floatdata,stringdata')

    fh = StringIO.StringIO()
    mlab.rec2csv(ra, fh)
    fh.seek(0)
    if 0:
        print('CSV contents:', '-' * 40)
        print(fh.read())
        print('-' * 40)
        fh.seek(0)
    ra2 = mlab.csv2rec(fh)
    fh.close()
    #print 'ra', ra
    #print 'ra2', ra2
    for name in ra.dtype.names:
        if 0:
            print(name, repr(ra[name]), repr(ra2[name]))
            dt = ra.dtype[name]
            print('repr(dt.type)', repr(dt.type))
        self.failUnless(numpy.all(
            ra[name] == ra2[name]))  # should not fail with numpy 1.0.5
def main():
    print "initializing"
    ap.env.overwriteOutput = True
    ap.env.workspace = WORKSPACE
    print "copying"
    ap.CopyFeatures_management(INFEATURES, OUTFEATURES)
    print "joining"
    arr = mlab.csv2rec(INCSV)
    ap.da.ExtendTable(OUTFEATURES, JOIN_FIELD_SHP, arr, JOIN_FIELD_CSV)
    print "saving"
    arr = ap.da.TableToNumPyArray(OUTFEATURES, "*")
    mlab.rec2csv(arr, OUTCSV)
    print "complete"
def test_recarray_csv_roundtrip():
    expected = np.recarray((99,),
                           [('x', np.float), ('y', np.float), ('t', np.float)])
    # initialising all values: uninitialised memory sometimes produces floats
    # that do not round-trip to string and back.
    expected['x'][:] = np.linspace(-1e9, -1, 99)
    expected['y'][:] = np.linspace(1, 1e9, 99)
    expected['t'][:] = np.linspace(0, 0.01, 99)
    if sys.version_info[0] == 2:
        fd = tempfile.TemporaryFile(suffix='csv', mode="wb+")
    else:
        fd = tempfile.TemporaryFile(suffix='csv', mode="w+", newline='')
    mlab.rec2csv(expected, fd)
    fd.seek(0)
    actual = mlab.csv2rec(fd)
    fd.close()
    assert np.allclose(expected['x'], actual['x'])
    assert np.allclose(expected['y'], actual['y'])
    assert np.allclose(expected['t'], actual['t'])
def main(basin_csv, ut_csv, uc_csv, ncons_csv):
    basin_rec = mlab.csv2rec(basin_csv)
    uc_rec = mlab.csv2rec(uc_csv)
    ut_rec = mlab.csv2rec(ut_csv)
    ids = basin_rec["basinid"]
    d_ids = basin_rec["dwnbasinid"]
    uc = gen_merge.arrange_vector_by_ids(uc_rec["ct"], uc_rec["basinid"], ids)
    ut = gen_merge.arrange_vector_by_ids(ut_rec["ut"], ut_rec["basinid"], ids)
    unc = ut - uc
    n = len(ids)
    ncons = fa.accumulate(ids, d_ids, f0, f, unc)
    outrec = np.rec.fromarrays((ids, ncons), names=("basinid", "ncons"))
    mlab.rec2csv(outrec, ncons_csv)
def test_csv2rec_masks(self):
    # Make sure masked entries survive roundtrip
    csv = """date,age,weight,name
2007-01-01,12,32.2,"jdh1"
0000-00-00,0,23,"jdh2"
2007-01-03,,32.5,"jdh3"
2007-01-04,12,NaN,"jdh4"
2007-01-05,-1,NULL,"""
    missingd = dict(date='0000-00-00', age='-1', weight='NULL')
    fh = StringIO.StringIO(csv)
    r1 = mlab.csv2rec(fh, missingd=missingd)
    fh = StringIO.StringIO()
    mlab.rec2csv(r1, fh, missingd=missingd)
    fh.seek(0)
    r2 = mlab.csv2rec(fh, missingd=missingd)
    self.failUnless(numpy.all(r2['date'].mask == [0, 1, 0, 0, 0]))
    self.failUnless(numpy.all(r2['age'].mask == [0, 0, 1, 0, 1]))
    self.failUnless(numpy.all(r2['weight'].mask == [0, 0, 0, 0, 1]))
    self.failUnless(numpy.all(r2['name'].mask == [0, 0, 0, 0, 1]))
def main():
    basins = mlab.csv2rec(BASINCSV)
    dwnbasin = basins["dwnbasinid"].astype(np.long)
    basinid = basins["basinid"].astype(np.long)
    # basins with no downstream basin drain to themselves
    dwnbasin[dwnbasin == 0] = basinid[dwnbasin == 0]
    numbasins = np.max(basinid)
    dbid = np.zeros(numbasins + 1, dtype=np.long)
    dbid[basinid] = dwnbasin
    olddbid = dbid.copy()
    dbid1 = dbid.copy()
    dbid = dbid[dbid]
    # follow downstream pointers until every basin maps to its terminal basin
    while np.sum(olddbid != dbid):
        olddbid = dbid.copy()
        dbid = dbid[dbid]
    outrec = np.rec.fromarrays([np.arange(0, numbasins + 1), dbid, dbid1],
                               names=("BasinID", "bigbasin", "dbid"))
    mlab.rec2csv(outrec, "big_basins.csv")
    ap.CopyFeatures_management(RAWBASINS, BIGBASINS)
    ap.da.ExtendTable(BIGBASINS, BASIN_ID_FIELD, outrec, BASIN_ID_FIELD)
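The while loop above is a fixed-point pointer-chase; here is a toy illustration (hypothetical basin IDs) of how repeated `dbid = dbid[dbid]` converges every basin onto its terminal "big basin":

import numpy as np

# basin 1 -> 2 -> 3; basins 3 and 4 drain to themselves; index 0 is unused
dbid = np.array([0, 2, 3, 3, 4])
olddbid = dbid.copy()
dbid = dbid[dbid]
while np.sum(olddbid != dbid):
    olddbid = dbid.copy()
    dbid = dbid[dbid]
print(dbid)  # [0 3 3 3 4] -- basins 1 and 2 now map to terminal basin 3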
def complete_data_single(symbol, saveNewFile=False):
    good_dates = get_data("SPY").date
    data = get_data(symbol)
    dates = data.date

    # First check if the records from the first days are missing
    # and fill this data with the record found
    if not (good_dates[0] in dates):
        # First find the most recent values that are in the data
        open_val = data[0][1]
        high = data[0][2]
        low = data[0][3]
        close = data[0][4]
        volume = data[0][5]
        adj_close = data[0][6]
        # Then add that record to the beginning until the data starts;
        # it is necessary to modify the date
        i = 0
        while not (good_dates[i] in dates):
            new = (good_dates[i], open_val, high, low, close, volume,
                   adj_close)
            #n = np.array(most_recent, dtype=data.dtype)
            data = np.insert(data, i, new, 0)
            i = i + 1

    # TODO: Missing values not at the beginning
    if saveNewFile:
        try:
            os.remove('./data/%s - old.csv' % symbol)
        except:
            pass
        os.rename('./data/%s.csv' % symbol, './data/%s - old.csv' % symbol)
        mlab.rec2csv(data, './data/%s.csv' % symbol, delimiter=',')

    return data
            nv_scan_candidates = equal_scans
            # chose the subject with closest baseline age scan
            target_age = np.min(age[subj])
            candidates_age = [np.min(age[s]) for s in nv_scan_candidates]
            closest = np.argmin(abs(candidates_age - target_age))
            nv_match = nv_scan_candidates[closest]
            nv_matches.append(nv_match)
            nv_subjects.remove(nv_match)
            print 'Matched ADHD %d (%d scans, %s) to NV %d (%d scans, %s).' % (
                subj, len(rows[subj]), gf[rows[subj][0]]['sex'],
                nv_match, len(rows[nv_match]), gf[rows[nv_match][0]]['sex'])
            break
        else:
            target_num_scans -= 1
    if not found:
        rm.append(subj)

# remove all subjects for whom we didn't find a match
print 'ADHD subjects without matches:', rm
for subj in rm:
    adhd_subjects.remove(subj)

# finally, create new variable and output it to a new file
match_bool = np.zeros(len(gf))
for subj in (nv_matches + adhd_subjects):
    match_bool[rows[subj]] = 1
match_bool = mlab.rec_append_fields(gf, var, match_bool)
mlab.rec2csv(match_bool,
             csv_file[:-4] + '_matched_onSex_onNumScan_onBaseAge.csv')
nburn = 100
pos = [THETA + 1e-4 * np.random.randn(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob,
                                args=(cat['X'], cat['Y']), threads=nthreads)
sampler.run_mcmc(pos, nsamples)
samples = sampler.chain[:, nburn:, :].reshape((-1, ndim))

rich, [rich_min, rich_max] = median_interval(samples[:, 0])
x, [xmin, xmax] = median_interval(samples[:, 1])
y, [ymin, ymax] = median_interval(samples[:, 2])

# True centroid
x0, y0 = WCS.wcs_world2pix(hdu.header['LON'], hdu.header['LAT'], 1)

# Results
res = [x0, y0, x, xmin, xmax, y, ymin, ymax]
results.append(res)

if do_plot:
    fig = corner.corner(samples, labels=["rich", "x", "y"])
    fig.savefig("triangle_eri2.png")

results = np.rec.array(results, names=['lon', 'lat', 'x', 'xmin', 'xmax',
                                       'y', 'ymin', 'ymax'])
filename = 'results2_b%i_s%i.csv' % (NBINS, nsamples)
print("Writing %s ..." % filename)
rec2csv(results, filename)
else:
    testFiles = glob.glob(dirName + '*.csv')

# for each file, if there's no modified version of it, create it
for fname in testFiles:
    if fname.find('modified') < 0:
        modName = fname[:-4] + '_modified.csv'
        if not os.path.exists(modName):
            shutil.copyfile(fname, modName)

# for each file in the directory, replace all the occurrences of the
# first column by the 3rd column
for fname in testFiles:
    # only operate on modified files
    if fname.find('modified') >= 0:
        data = np.recfromcsv(fname)
        changed = False
        # look for occurrences in the first column
        for row in data:
            if row[0] == rec[0]:
                row[0] = rec[2]
                changed = True
        # look for occurrences in the first row. The header is read in as a
        # tuple, so we need to do it differently
        new_names = []
        for column in data.dtype.names:
            if column == str(rec[0]):
                new_names.append(str(rec[2]))
            else:
                new_names.append(column)
        data.dtype.names = new_names
        if changed:
            mlab.rec2csv(data, fname)
ind = np.where(table['JobID'] == sid)
for k in dr.split("_"):
    if k.endswith("src"):
        Nsrcs[ind] = int(k[:-3])
    if k.endswith("time"):
        Ntimes[ind] = int(k[:-4])
    if k.endswith('chan'):
        Nchans[ind] = int(k[:-4])
    if k == 'mwa':
        Nbls[ind] = 8128
    if k == 'triangle':
        Nbls[ind] = 3


def hms2sec(hms):
    h, m, s = map(float, hms.split(":"))
    return h * 60.**2 + m * 60. + s


runtime_sec = np.array(map(hms2sec, table['Elapsed']))
cores_per_node = table['NCores'] / table['NNodes']
ntasks = Nsrcs * Ntimes * Nbls * Nchans
timepertask = runtime_sec / ntasks

table = append_fields(table,
                      ['CoresPerNode', 'Nbls', 'Ntimes', 'Nchan', 'Nsrc',
                       'Ntasks', 'Runtime_Seconds', 'RuntimePerTask_seconds'],
                      [cores_per_node, Nbls, Ntimes, Nchans, Nsrcs, ntasks,
                       runtime_sec, timepertask])

print table
print table.dtype
rec2csv(table, 'profiling_results_table.csv')
def export(model_information, model_summary, metrics, lift_table, correlation):
    ## Export model output in csv
    blank_type = np.dtype([('Col1', 'S100'), ('Col2', 'S100'),
                           ('Col3', 'S100'), ('Col4', 'S100'),
                           ('Col5', 'S100'), ('Col6', 'S100'),
                           ('Col7', 'S100')])
    blank_line = np.asarray([(" ", " ", " ", " ", " ", " ", " ")],
                            dtype=blank_type)
    blank_line = np.hstack((blank_line, blank_line))
    dot_line = np.asarray([("=======================", "=======================",
                            "=======================", "=======================",
                            "=======================", "=======================",
                            "=======================")], dtype=blank_type)
    title_1 = np.asarray([(" ", " ", "Model Information", " ", " ", " ", " ")],
                         dtype=blank_type)
    title_2 = np.asarray([(" ", " ", "Model Summary", " ", " ", " ", " ")],
                         dtype=blank_type)
    title_3 = np.asarray([(" ", " ", "Metrics", " ", " ", " ", " ")],
                         dtype=blank_type)
    title_4 = np.asarray([(" ", " ", "Lift Table", " ", " ", " ", " ")],
                         dtype=blank_type)
    title_5 = np.asarray([(" ", " ", "Correlation Table", " ", " ", " ", " ")],
                         dtype=blank_type)

    file_name = str('Logistic Regression Output ' +
                    datetime.datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S') +
                    '.csv').replace(":", "-")

    with open(file_name, 'wb') as outfile:
        mlab.rec2csv(blank_line, outfile, withheader=False)
        mlab.rec2csv(title_1, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(model_information, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(blank_line, outfile, withheader=False)
        mlab.rec2csv(title_2, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(model_summary, outfile, withheader=True)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(blank_line, outfile, withheader=False)
        mlab.rec2csv(title_3, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(metrics, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(blank_line, outfile, withheader=False)
        mlab.rec2csv(title_4, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(lift_table.to_records(), outfile, withheader=True)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(blank_line, outfile, withheader=False)
        mlab.rec2csv(title_5, outfile, withheader=False)
        mlab.rec2csv(dot_line, outfile, withheader=False)
        mlab.rec2csv(correlation, outfile, withheader=True)
        mlab.rec2csv(dot_line, outfile, withheader=False)
def normalize_dat(dat_filename):
    """Normalise the dat file and return a record with normalized values"""
    names = ['value', 'angle']
    c_rec = mlab.csv2rec(dat_filename, names=names, delimiter=' ')
    total_value = sum(c_rec['value'])
    c_rec['value'] = c_rec['value'] / total_value
    return c_rec


def merge_dat(dat_rec_list):
    """Create a new dat record from a list of normalized dat records"""
    nb_rec = len(dat_rec_list)
    new_rec = dat_rec_list[0]
    new_rec['value'] = new_rec['value'] / nb_rec
    for c_rec in dat_rec_list[1:]:
        new_rec['value'] += c_rec['value'] / nb_rec
    return new_rec


if __name__ == '__main__':
    dat_filenames = sys.argv[1:]
    dat_recs = []
    for dat_filename in dat_filenames:
        dat_rec = normalize_dat(dat_filename)
        dat_recs.append(dat_rec)
    new_record = merge_dat(dat_recs)
    mlab.rec2csv(new_record, 'out.dat', delimiter=' ')
def save_results(results, file_name):
    '''
    save the results to the specified tar.gz file. The results are
    stored as csv files. There is an x.csv, and a csv for each
    outcome. In addition, there is a metadata csv which contains
    the datatype information for each of the columns in the x array.

    Parameters
    ----------
    results : tuple
        the return of run_experiments
    file_name : str
        the path of the file

    Raises
    ------
    IOError if file not found
    '''
    file_name = os.path.abspath(file_name)

    def add_file(tararchive, string_to_add, filename):
        tarinfo = tarfile.TarInfo(filename)
        tarinfo.size = len(string_to_add)
        fh = BytesIO(string_to_add.encode('UTF-8'))
        z.addfile(tarinfo, fh)

    def save_numpy_array(fh, data):
        data = pd.DataFrame(data)
        data.to_csv(fh, header=False, index=False, encoding='UTF-8')

    experiments, outcomes = results
    with tarfile.open(file_name, 'w:gz') as z:
        # write the x to the zipfile
        experiments_file = WriterFile()
        rec2csv(experiments, experiments_file, withheader=True)
        add_file(z, experiments_file.getvalue(), 'experiments.csv')

        # write experiment metadata
        dtype = experiments.dtype.descr
        dtype = ["{},{}".format(*entry) for entry in dtype]
        dtype = "\n".join(dtype)
        add_file(z, dtype, 'experiments metadata.csv')

        # write outcome metadata
        outcome_names = outcomes.keys()
        outcome_meta = ["{},{}".format(outcome, outcomes[outcome].shape)
                        for outcome in outcome_names]
        outcome_meta = "\n".join(outcome_meta)
        add_file(z, outcome_meta, "outcomes metadata.csv")

        # outcomes
        for key, value in outcomes.items():
            fh = WriterFile()
            nr_dim = len(value.shape)
            if nr_dim == 3:
                for i in range(value.shape[2]):
                    data = value[:, :, i]
                    save_numpy_array(fh, data)
                    fh = fh.getvalue()
                    fn = '{}_{}.csv'.format(key, i)
                    add_file(z, fh, fn)
                    fh = WriterFile()
            else:
                save_numpy_array(fh, value)
                fh = fh.getvalue()
                add_file(z, fh, '{}.csv'.format(key))
    info("results saved successfully to {}".format(file_name))
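A hedged sketch of reading one of these archives back (the archive and outcome names here are hypothetical; the layout follows the docstring above: experiments.csv with a header, two metadata CSVs, and one header-less CSV per outcome):

import tarfile
import pandas as pd

with tarfile.open('results.tar.gz', 'r:gz') as z:
    # experiments.csv was written with withheader=True
    experiments = pd.read_csv(z.extractfile('experiments.csv'))
    # outcome CSVs were written with header=False, index=False
    outcome = pd.read_csv(z.extractfile('some_outcome.csv'), header=None)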
def fit_classifier(aml_clean_path, class_path, test=False, performance=False,
                   n_fits=100, test_split=0.2, save_clf=True):
    '''Fits random forest classifier to aml_ref_clean formatted csv.

    Note that the species code should be contained in the folder col.'''

    # Get class_path dir, used for ancillary file names
    class_dir, tail = os.path.split(class_path)
    prefix = tail.split('.')[0]

    # Load refe_features_table
    table = csv2rec(aml_clean_path)

    # Only use calls with qual < 0.3 (Armitage)
    table = table[table.qual < 0.3]

    # Get target col (y) with integer codes instead of spp names
    y_str = table.folder  # Assumes spp name is in folder col
    y_str_uniq = set(list(y_str))
    n_spp = len(y_str_uniq)
    spp_codes = range(0, n_spp)

    code_table = np.array(zip(spp_codes, y_str_uniq),
                          dtype=[('code', '<i8'), ('spp', '|S8')])

    y = np.zeros(len(y_str))  # Get col of full length with codes, not names
    for code, spp in code_table:
        y[y_str == spp] = int(code)

    # Get filename col for later grouping into passes
    f = table.filename

    # Remove non-feature cols from table
    table = rec_drop_fields(table, ['path', 'folder', 'filename', 'st', 'dc',
                                    'qual', 'pmc'])

    # Get list of feature names remaining in table
    feature_names = table.dtype.names

    # Recarray to ndarray - http://stackoverflow.com/questions/5957380/
    # convert-structured-array-to-regular-numpy-array
    X = table.view((float, len(table.dtype.names)))

    # Partition data if test, holding portion for testing
    if not test:
        X_tr = X
        y_tr = y
        f_tr = f
        X_te = X
        y_te = y
        f_te = f
    else:
        # Use StratifiedShuffleSplit since train_test_split does not stratify
        sss = StratifiedShuffleSplit(y, 1, test_size=test_split)
        for train_index, test_index in sss:  # Only once since n_iter=1 above
            X_tr, X_te = X[train_index], X[test_index]
            y_tr, y_te = y[train_index], y[test_index]
            f_tr, f_te = f[train_index], f[test_index]

        sort_ind = f_te.argsort()  # Sort test data for pass analysis later
        X_te = X_te[sort_ind, :]  # Sort rows
        y_te = y_te[sort_ind]
        f_te = f_te[sort_ind]
        # (Train data order does not matter)

    # Define and fit classifier
    clf = RandomForestClassifier(n_estimators=n_fits, oob_score=True,
                                 compute_importances=True)
    clf.fit(X_tr, y_tr)

    # If performance, save various performance metrics
    # NOTE: Performance of passes is difficult to understand if test=True,
    # as the calls in one pass may be split up.
    if performance:

        # Get OOB score
        print 'OOB Score: ', clf.oob_score_

        # Predict on test data, which may be held out (test=True) or all data
        y_te_pr = clf.predict(X_te)

        # Get true data and predictions by passes
        pred_te = clf.predict_proba(X_te)  # Prob of each spp
        f_te_p, pred_te_p, other = sum_group(f_te, pred_te, [y_te])
        y_te_p = other[0]  # Actual spp for each pass

        y_te_p_pr = []
        for row in xrange(len(y_te_p)):  # Find pred species for each pass
            y_te_p_pr.append(pred_te_p[row].argmax())  # First ind, ties bias
        y_te_p_pr = np.array(y_te_p_pr)

        # Get accuracy and confusion matrix for calls
        def make_conf_mat(y_te, y_te_pr, type):
            conf_mat = metrics.confusion_matrix(y_te, y_te_pr)
            conf_mat_frac = conf_mat / np.sum(conf_mat, axis=0)
            print type, ' Accuracy: ', metrics.zero_one_score(y_te, y_te_pr)
            np.savetxt(os.path.join(class_dir,
                                    prefix + '_conf_' + type + '.csv'),
                       conf_mat, fmt='%i', delimiter=',')
            np.savetxt(os.path.join(class_dir,
                                    prefix + '_conffr_' + type + '.csv'),
                       conf_mat_frac, fmt='%.6f', delimiter=',')

        make_conf_mat(y_te, y_te_pr, 'call')
        make_conf_mat(y_te_p, y_te_p_pr, 'pass')

    # Save spp_code table, feature_names, and pickle classifier
    rec2csv(code_table, os.path.join(class_dir, prefix + '_spp_codes.csv'))
    rec2csv(np.array(list(feature_names), dtype=[('features', 'S8')]),
            os.path.join(class_dir, prefix + '_feature_names.csv'))
    if save_clf:
        joblib.dump(clf, class_path, compress=9)
# Pack it into a recarray:
names = ('ppm', 'echo_on', 'echo_off', 'diff')
formats = (float, float, float, float)
dt = zip(names, formats)
m_e1 = np.mean(G.echo_on, 0)
m_e2 = np.mean(G.echo_off, 0)
diff = m_e2 - m_e1

if in_args.out_file:
    prep_arr = [(G.f_ppm[i], m_e1[i], m_e2[i], diff[i])
                for i in range(len(G.f_ppm))]
    out_array = np.array(prep_arr, dtype=dt)

    # And save to output:
    mlab.rec2csv(out_array, in_args.out_file)

G.fit_gaba()

if in_args.plot:
    fig, ax = plt.subplots(3)
    ax[0].plot(G.f_ppm, m_e1)
    ax[0].plot(G.f_ppm[G.cr_idx], np.mean(G.creatine_model, 0), 'r')
    ax[1].plot(G.f_ppm, m_e2)
    ax[2].plot(G.f_ppm, diff)
    ax[2].plot(G.f_ppm[G.gaba_idx], np.mean(G.gaba_model, 0), 'r')
    for a in ax:
        a.invert_xaxis()
        a.set_xlabel('ppm')
    plt.show()
def group_things(list_of_jsons):
    """Fields to save in output csv

    - Subject id
    - Num Outliers
    - Mincost
    - All tsnr values (0 for missing values)
    """
    import numpy as np
    from nipype.utils.filemanip import load_json
    from bips.workflows.gablab.wips.fmri.preprocessing.group_preproc_QA import extract_snr, extract_art
    #from bips.workflows.group_preproc_QA import extract_art

    snr_names = []
    snr_dict = {}
    for tmp in list_of_jsons:
        a = load_json(tmp)
        names = [b[0] for b in a['SNR_table'][0][1:]]
        snr_names += names
    snr_names = np.unique(snr_names).tolist()
    for name in snr_names:
        snr_dict[name] = []

    mincost = []
    common_outliers = []
    total_outliers = []
    intensity_outliers = []
    motion_outliers = []
    subject_id = []

    all_fields = ['subject_id', 'total_outliers', 'mincost',
                  'motion_outliers', 'intensity_outliers',
                  'common_outliers'] + snr_names
    dtype = [('subject_id', '|S20')] + [(str(n), 'f4') for n in all_fields[1:]]
    arr = np.zeros(len(list_of_jsons), dtype=dtype)

    for fi in list_of_jsons:
        f = load_json(fi)
        subject_id.append(f['subject_id'])
        mot, inten, com, out = extract_art(f['art'])
        motion_outliers.append(mot)
        intensity_outliers.append(inten)
        common_outliers.append(com)
        total_outliers.append(out)
        mincost.append(f['mincost'][0])
        for n in snr_names:
            t = extract_snr(f['SNR_table'], n)
            snr_dict[n].append(t)

    arr['subject_id'] = subject_id
    arr['total_outliers'] = total_outliers
    arr['mincost'] = mincost
    arr['motion_outliers'] = motion_outliers
    arr['intensity_outliers'] = intensity_outliers
    arr['common_outliers'] = common_outliers
    for key, item in snr_dict.iteritems():
        arr[key] = item

    import os
    from matplotlib.mlab import rec2csv
    outfile = os.path.abspath('grouped_metrics.csv')
    rec2csv(arr, outfile)
    return outfile
def create_info_table(self, raster_join_field, attribute_file,
                      attribute_join_field, drop_fields=None):
    """
    Create ArcInfo table from attribute csv file

    Parameters
    ----------
    raster_join_field : str
        field in raster to use for joining to attribute data
    attribute_file : str
        name and path of file containing attribute information
    attribute_join_field : str
        field in attribute file to use to join to raster
    drop_fields : list of str
        fields in the attribute file to drop before join to raster

    Returns
    -------
    name of temp ArcInfo table, list of fields to join from info table
    """
    print('Building info table from attribute file')

    # Crosswalk of numpy types to ESRI types for numeric data
    numpy_to_esri_type = {
        ('b', 1): 'SHORT',
        ('i', 1): 'SHORT',
        ('i', 2): 'SHORT',
        ('i', 4): 'LONG',
        ('f', 4): 'FLOAT',
        ('f', 8): 'DOUBLE',
    }

    # Read the CSV file in to a recarray
    ra = mlab.csv2rec(attribute_file)
    col_names = [str(x).upper() for x in ra.dtype.names]
    ra.dtype.names = col_names

    # If there are fields to drop, do that now and get a new recarray
    if drop_fields is not None:
        # Ensure that the drop fields are actually fields in the current
        # recarray
        drop_fields = [x for x in drop_fields if x in ra.dtype.names]

        # Create a new recarray with these fields omitted
        ra = mlab.rec_drop_fields(ra, drop_fields)
        col_names = list(ra.dtype.names)

    # Get the column types and formats
    col_types = [(ra.dtype[i].kind, ra.dtype[i].itemsize)
                 for i in range(len(ra.dtype))]
    formats = [ra.dtype[i].str for i in range(len(ra.dtype))]

    # Sanitize column names
    #   No field name may be longer than 16 chars
    #   No field name can start with a number
    for i in range(len(col_names)):
        if len(col_names[i]) > 16:
            col_names[i] = col_names[i][0:16]
        if col_names[i][0].isdigit():
            col_names[i] = col_names[i].lstrip('0123456789')

    # Reset the names for the recarray
    ra.dtype.names = col_names

    # Sanitize the data
    # Change True/False to 1/0 to be read into short type
    bit_fields = [(i, n) for (i, (n, t))
                  in enumerate(zip(col_names, col_types)) if t[0] == 'b']
    if bit_fields:
        for rec in ra:
            for (col_num, field) in bit_fields:
                value = getattr(rec, field)
                if value:
                    setattr(rec, field, 1)
                else:
                    setattr(rec, field, 0)

        # Change the bit fields to be short integer
        for (col_num, field) in bit_fields:
            formats[col_num] = '<i2'

    # Create a sanitized recarray and output back to CSV
    temp_csv = os.path.join(env.workspace, 'xxtmp.csv')
    ra2 = np.rec.fromrecords(ra, names=col_names, formats=formats)
    mlab.rec2csv(ra2, temp_csv)

    # Create a scratch name for the temporary ArcInfo table
    temp_table = arcpy.CreateScratchName('', '', 'ArcInfoTable')

    # Create the ArcInfo table and add the fields
    table_name = os.path.basename(temp_table)
    arcpy.CreateTable_management(env.workspace, table_name)
    for (n, t) in zip(col_names, col_types):
        try:
            esri_type = numpy_to_esri_type[t]
            arcpy.AddField_management(temp_table, n, esri_type)
        except KeyError:
            if t[0] == 'S':
                arcpy.AddField_management(temp_table, n, 'TEXT', '#', '#',
                                          t[1])
            else:
                err_msg = 'Type not found for ' + str(t)
                print(err_msg)
                continue

    # Append the records from the CSV field to the temporary INFO table
    arcpy.Append_management(temp_csv, temp_table, 'NO_TEST')

    # Strip out the join field from the names if they are the same
    raster_join_field = raster_join_field.upper()
    attribute_join_field = attribute_join_field.upper()
    if raster_join_field == attribute_join_field:
        col_names.remove(attribute_join_field)

    # Create a semi-colon delimited string of the fields we want to join
    field_list = ';'.join(col_names)

    # Clean up
    os.remove(temp_csv)

    return temp_table, field_list
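A hypothetical call of create_info_table, assuming an instance `builder` of the class that defines it and a CSV whose join column matches the raster's attribute field; joining the returned fields with arcpy.JoinField_management is one plausible follow-up, not something the method itself prescribes:

temp_table, field_list = builder.create_info_table(
    raster_join_field='VALUE',
    attribute_file='attributes.csv',
    attribute_join_field='VALUE',
    drop_fields=['NOTES'])
arcpy.JoinField_management('my_raster', 'VALUE', temp_table, 'VALUE',
                           field_list)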