def resolve_dict_iterator(iterator: Any, nrows: int = None) -> tuple:
    """Note that this function produces sorted arrays."""
    sort_fields = ["row", "col", "amount", "uncertainty_type"]
    data = (dictionary_formatter(row) for row in iterator)
    array = create_structured_array(
        data,
        INDICES_DTYPE + [("amount", np.float32)] + UNCERTAINTY_DTYPE + [("flip", bool)],
        nrows=nrows,
        sort=True,
        sort_fields=sort_fields,
    )
    return (
        array["amount"],
        # Not repacking fields would cause this multi-field index to return a view
        # All columns would be serialized
        # See https://numpy.org/doc/stable/user/basics.rec.html#indexing-structured-arrays
        repack_fields(array[["row", "col"]]),
        repack_fields(
            array[
                [
                    "uncertainty_type",
                    "loc",
                    "scale",
                    "shape",
                    "minimum",
                    "maximum",
                    "negative",
                ]
            ]
        ),
        array["flip"],
    )
def test_repack_fields(self):
    dt = np.dtype('u1,f4,i8', align=True)
    a = np.zeros(2, dtype=dt)

    assert_equal(repack_fields(dt), np.dtype('u1,f4,i8'))
    assert_equal(repack_fields(a).itemsize, 13)
    assert_equal(repack_fields(repack_fields(dt), align=True), dt)

    # make sure type is preserved
    dt = np.dtype((np.record, dt))
    assert_(repack_fields(dt).type is np.record)
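# A minimal standalone sketch (assuming only NumPy >= 1.16) of why repack_fields
# matters in the examples below: multi-field indexing returns a view that keeps
# the parent dtype's itemsize and padding, while repack_fields returns a packed
# copy. The small array here is invented purely for illustration.
import numpy as np
from numpy.lib.recfunctions import repack_fields

a = np.zeros(3, dtype=[('x', 'u1'), ('y', 'f4'), ('z', 'i8')])
view = a[['x', 'z']]          # view: itemsize still reflects the parent dtype
packed = repack_fields(view)  # copy: selected fields laid out contiguously
print(view.dtype.itemsize, packed.dtype.itemsize)  # -> 13 9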
def pointcloud2_to_array(cloud_msg, split_rgb=False, remove_padding=True):
    '''
    Converts a rospy PointCloud2 message to a numpy recordarray

    Reshapes the returned array to have shape (height, width), even if the
    height is 1.

    The reason for using np.fromstring rather than struct.unpack is speed...
    especially for large point clouds, this will be <much> faster.
    '''
    # construct a numpy record type equivalent to the point type of this cloud
    dtype_list = pointcloud2_to_dtype(cloud_msg)

    # parse the cloud into an array
    cloud_arr = np.fromstring(cloud_msg.data, dtype_list)

    # remove the dummy fields that were added
    if remove_padding:
        cloud_arr = recfuncs.repack_fields(cloud_arr[[
            fname for fname, _type in dtype_list
            if not (fname[:len(DUMMY_FIELD_PREFIX)] == DUMMY_FIELD_PREFIX)
        ]])

    if split_rgb:
        cloud_arr = split_rgb_field(cloud_arr)
    return np.reshape(cloud_arr, (cloud_msg.height, cloud_msg.width))
def recarray_drop_columns(array, columns):
    '''
    Remove columns from rec array
    '''
    to_use = [col for col in array.dtype.names if col not in columns]
    subset = array.view(subset_dtype(array.dtype, to_use))
    return repack_fields(subset)
async def coadd():
    ''' '''
    release = request.args.get('release', 'dr16')
    run2d = request.args.get('run2d', None)
    plate = int(request.args.get('plate', None))
    mjd = int(request.args.get('mjd', None))
    fiber = int(request.args.get('fiber', 1))
    sample = int(request.args.get('sample', 1))
    bands = request.args.get('bands', 'all')
    debug_flag = request.args.get('debug', '')

    survey = 'sdss' if not run2d.startswith('v') else 'eboss'
    root = '/ssd0/sdss/%s/%s/spectro/redux/' % (release, survey)
    path = root + '%s/%04i/' % (run2d, plate)
    fname = path + 'spec-%04i-%05i-%04i.npy' % (plate, mjd, fiber)
    print(fname)

    if bands == 'all':
        # read the .npy file as raw bytes for the response body
        with open(fname, 'rb') as fd:
            _bytes = fd.read()
        return web.response(body=_bytes)
    else:
        data = np.load(str(fname))
        dbands = data[[c for c in list(data.dtype.names) if c in bands]]
        dbands = rfn.repack_fields(dbands)

        tmp_file = NamedTemporaryFile(delete=False, dir='/tmp').name
        np.save(tmp_file, dbands, allow_pickle=False)
        with open(tmp_file + '.npy', 'rb') as fd:
            _bytes = fd.read()
        os.unlink(tmp_file)
        return web.response(body=_bytes)
def drop_column(array, column):
    from numpy.lib.recfunctions import repack_fields

    cols = list(array.dtype.names)
    if column in cols:
        cols.remove(column)
    return repack_fields(array[cols])
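# Hypothetical usage of the drop_column helper above; the structured array and
# its field names are invented for illustration only.
import numpy as np

arr = np.array([(1, 2.0, b'a'), (2, 3.5, b'b')],
               dtype=[('id', 'i4'), ('val', 'f8'), ('tag', 'S1')])
trimmed = drop_column(arr, 'tag')
print(trimmed.dtype.names)  # ('id', 'val'), repacked without the dropped field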
def rec_to_ndarr(rec_arr, data_type=float):
    """
    Function to transform a numpy record array to a nd array.
    """
    # fix for numpy >= 1.16.0 with masked arrays
    # https://numpy.org/devdocs/release/1.16.0-notes.html#multi-field-views-return-a-view-instead-of-a-copy
    return np.array(recFunc.structured_to_unstructured(
        recFunc.repack_fields(rec_arr[list(rec_arr.dtype.names)])),
        dtype=data_type)
def _rec_to_ndarr(rec_arr, data_type=float):
    """
    Function to transform a numpy record array to a nd array.
    dupe of SimPEG.electromagnetics.natural_source.utils.rec_to_ndarr to avoid
    circular import
    """
    # fix for numpy >= 1.16.0
    # https://numpy.org/devdocs/release/1.16.0-notes.html#multi-field-views-return-a-view-instead-of-a-copy
    return np.array(recFunc.structured_to_unstructured(
        recFunc.repack_fields(rec_arr[list(rec_arr.dtype.names)])),
        dtype=data_type)
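# Illustrative call of _rec_to_ndarr (same pattern as rec_to_ndarr above),
# assuming the module-level imports the function relies on (numpy as np,
# numpy.lib.recfunctions as recFunc) are in scope. The record array is invented.
import numpy as np

rec = np.array([(1.0, 10.0), (2.0, 20.0)],
               dtype=[('freq', 'f8'), ('amp', 'f8')])
mat = _rec_to_ndarr(rec)
print(mat.shape)  # (2, 2) plain float ndarray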
def load_structured_data(file):
    ## file for data (x,y)
    if Path(str(file)).is_file():
        structured_data = np.genfromtxt(file, delimiter=',', names=True,
                                        dtype=float)
        data = rf.structured_to_unstructured(rf.repack_fields(structured_data))
    else:
        raise FileNotFoundError(file)  # raise error
    data = data.reshape(1, -1) if len(data.shape) == 1 else data
    names = structured_data.dtype.names
    return names, data
def polys_to_segments(self, as_basic=True, to_orig=False, as_3d=False):
    """Segment poly* structures into o-d pairs from start to finish.

    as_basic : boolean
        True, returns an Nx4 array (x0, y0, x1, y1) of from-to coordinates.
        False, returns a structured array.
        If `as_3d` is True, then `as_basic` is set to False.
    to_orig : boolean
        True, moves the coordinates back to their original position
        defined by the `LL` property of the Geo array.
    as_3d : boolean
        True, the point pairs are returned as a 3D array in the form
        [[X_orig', Y_orig'], ['X_dest', 'Y_dest']], without the distances.

    Notes
    -----
    Use `prn_tbl` if you want to see a well formatted output.
    """
    if self.K not in (1, 2):
        print("Poly* features required.")
        return None
    # -- basic return as ndarray used by common_segments
    if as_3d:  # The array cannot be basic if it is 3d
        as_basic = False
    if to_orig:
        tmp = self.XY + self.LL
        b_vals = [tmp[ft[0]:ft[1]] for ft in self.FT]  # shift to orig extent
    else:
        b_vals = self.bits
    # -- Do the concatenation
    fr_to = np.concatenate(
        [np.concatenate((b[:-1], b[1:]), axis=1) for b in b_vals], axis=0)
    # -- return if simple and not 3d representation
    if as_basic:
        return fr_to
    # -- return 3d from-to representation
    if as_3d:
        fr_to = fr_to[:, :4]
        s0, s1 = fr_to.shape
        return fr_to.reshape(s0, s1 // 2, s1 // 2)
    # -- structured array section
    # add bit ids and lengths to the output array
    b_ids = self.IFT
    segs = np.asarray([[[b_ids[i][0], *(b_ids[i][-2:])], len(b) - 1]
                       for i, b in enumerate(b_vals)], dtype='O')
    s_ids = np.concatenate([np.tile(i[0], i[1]).reshape(-1, 3) for i in segs],
                           axis=0)
    dist = (np.sqrt(np.sum((fr_to[:, :2] - fr_to[:, 2:4])**2, axis=1)))
    fr_to = np.hstack((fr_to, s_ids, dist.reshape(-1, 1)))
    dt = np.dtype([('X_fr', 'f8'), ('Y_fr', 'f8'), ('X_to', 'f8'),
                   ('Y_to', 'f8'), ('Orig_id', 'i4'), ('Part', 'i4'),
                   ('Seq_ID', 'i4'), ('Length', 'f8')])
    fr_to = uts(fr_to, dtype=dt)
    return repack_fields(fr_to)
def convert_pc_to_numpy(pc):
    # Extracts the 3D array of points - needs to do some management of
    # structured arrays for efficiency
    pts_struct = numpify(pc)[['x', 'y', 'z']]
    if numpy_ver[1] >= 15:
        from numpy.lib.recfunctions import repack_fields
        pts_struct = repack_fields(pts_struct)
    pts = pts_struct.view((pts_struct.dtype[0], 3))
    if len(pts.shape) == 3:
        pts = pts.transpose(2, 0, 1).reshape(3, -1).T
    return pts
def plot_results(setup, results, samples, constraints, gridnames, obs, obs_err,
                 photbands):
    # check for 10 possible plots. Should be enough for now.
    for i in range(10):

        pindex = 'plot' + str(i)
        if not pindex in setup:
            continue

        if setup[pindex]['type'] == 'sed_fit':

            res = setup[pindex].get('result', 'best')

            pl.figure(i)
            pl.clf()
            pl.subplots_adjust(wspace=0.25)
            plotting.plot_fit(obs, obs_err, photbands, pars=results,
                              constraints=constraints, grids=setup['grids'],
                              gridnames=gridnames, result=res)

            if not setup[pindex].get('path', None) is None:
                pl.savefig(setup[pindex].get('path', 'sed_fit.png'))

        if setup[pindex]['type'] == 'constraints':

            pl.figure(i, figsize=(2 * len(constraints), 6))
            pl.clf()
            pl.subplots_adjust(wspace=0.40, left=0.07, right=0.98)

            plotting.plot_constraints(constraints, samples, results)

            if not setup[pindex].get('path', None) is None:
                pl.savefig(setup[pindex].get('path', 'constraints.png'))

        if setup[pindex]['type'] == 'distribution':

            pars1 = []
            for p in setup[pindex].get('parameters', ['teff', 'rad', 'L', 'd']):
                if p in samples.dtype.names:
                    pars1.append(p)

            data = repack_fields(samples[pars1])

            if setup[pindex].get('show_best', False):
                truths = [results[p][0] for p in data.dtype.names]
            else:
                truths = None

            fig = corner.corner(
                data.view(np.float64).reshape(data.shape + (-1,)),
                labels=data.dtype.names,
                quantiles=setup[pindex].get('quantiles',
                                            [0.025, 0.16, 0.5, 0.84, 0.975]),
                levels=setup[pindex].get('levels', [0.393, 0.865, 0.95]),
                truths=truths,
                show_titles=True, title_kwargs={"fontsize": 12},
            )

            if not setup[pindex].get('path', None) is None:
                pl.savefig(setup[pindex].get('path', 'distribution.png'))
def coadd():
    """ """
    if request.method == 'POST':
        # Get the login user name
        release = request.form['release']
        run2d = request.form['run2d']
        plate = request.form['plate']
        mjd = request.form['mjd']
        fiber = request.form['fiber']
        sample = request.form['sample']
        if 'bands' in request.form:
            bands = request.form['bands']
        else:
            bands = 'all'
        debug_flag = request.form['debug']
    else:
        release = request.args.get('release', 'dr16')
        run2d = request.args.get('run2d', None)
        plate = request.args.get('plate', None)
        mjd = request.args.get('mjd', None)
        fiber = int(request.args.get('fiber', 1))
        sample = int(request.args.get('sample', 1))
        bands = request.args.get('bands', 'all')
        debug_flag = request.args.get('debug', '')

    run2d = int(run2d)
    plate = int(plate)
    mjd = int(mjd)

    root = '/ssd0/sdss/%s/sdss/spectro/redux/' % release
    path = root + '%d/%04i/' % (run2d, plate)
    fname = path + 'spec-%04i-%05i-%04i.npy' % (plate, mjd, fiber)

    if bands == 'all':
        return send_file(fname, mimetype='application/octet-stream')
    else:
        from numpy.lib import recfunctions as rfn
        from tempfile import NamedTemporaryFile

        data = np.load(str(fname))
        cols = data.dtype.names
        dbands = data[[c for c in list(data.dtype.names) if c in bands]]
        dbands = rfn.repack_fields(dbands)

        #bobj = BytesIO()
        #np.save(bobj, dbands, allow_pickle=False)
        #result = send_file(bobj, mimetype='application/octet-stream')
        #return result

        tmp_file = NamedTemporaryFile(delete=False, dir='/tmp').name
        np.save(tmp_file, dbands, allow_pickle=False)
        result = send_file(tmp_file + '.npy',
                           mimetype='application/octet-stream')
        os.unlink(tmp_file)
        return result
def _send_work_order(self, Work, w):
    """Sends an allocation function order to a worker
    """
    logger.debug("Manager sending work unit to worker {}".format(w))
    self.wcomms[w - 1].send(Work['tag'], Work)
    work_rows = Work['libE_info']['H_rows']
    if len(work_rows):
        if 'repack_fields' in dir():
            self.wcomms[w - 1].send(
                0, repack_fields(self.hist.H[Work['H_fields']][work_rows]))
        else:
            self.wcomms[w - 1].send(
                0, self.hist.H[Work['H_fields']][work_rows])
def __init__(self, fileName, fileNameEdges=None, preprocessingArgs=None):
    self.result_dir = "REVC_results_" + fileName
    self.figure_dir = "REVC_figures_" + fileName
    self.fileName = fileName
    HierarchichalPrinter.__init__(self)

    if not fileNameEdges:
        return

    self.prst("Reading file", fileNameEdges)

    edges = np.genfromtxt(
        fileNameEdges,
        delimiter=",",
        skip_header=True,
        dtype={
            "names": ["ID", "from_to_original", "length", "inspection",
                      "lakeID"],
            'formats': [IDTYPE, '2' + IDTYPE, "double", "3" + IDTYPE, IDTYPE]
        },
        autostrip=True)

    from_to = np.vstack(
        (edges["from_to_original"], edges["from_to_original"][:, ::-1]))
    edgeData = rf.repack_fields(edges[["ID", "length", "lakeID"]])
    edgeData = np.concatenate((edgeData, edgeData))
    edgeData = add_fields(edgeData, ["inspection"], [object], [None])

    vertexID = np.zeros(0, dtype=IDTYPE)
    vertexData = np.zeros(0, dtype=[("significant", bool)])

    graph = FlexibleGraph(from_to, edgeData, vertexID, vertexData,
                          replacementMode="shortest", lengthLabel="length")
    graph.set_default_vertex_data(True)

    super().__init__(graph, "length", "significant")
    self.preprocessing(preprocessingArgs)
    if fileName:
        self.save(fileName)
def mst(arr, calc_dist=True):
    """Determine the minimum spanning tree for a set of points represented
    by their inter-point distances. ie their `W`eights

    Parameters
    ----------
    W : array, normally an interpoint distance array
        Edge weights for example, distance, time, for a set of points.
        W needs to be a square array or a np.triu perhaps
    calc_dist : boolean
        True, if W is a points array, calculate W as the interpoint distance.
        False means that W is not a points array, but some other `weight`
        representing the interpoint relationship

    Returns
    -------
    pairs - the pair of nodes that form the edges
    """
    arr = np.unique(arr, True, False, False, axis=0)[0]
    W = arr[~np.isnan(arr[:, 0])]
    a_copy = np.copy(W)
    if calc_dist:
        W = _e_dist_(W)
    if W.shape[0] != W.shape[1]:
        raise ValueError("W needs to be square matrix of edge weights")
    Np = W.shape[0]
    pairs = []
    pnts_seen = [0]  # Add the first point
    n_seen = 1
    # exclude self connections by assigning inf to the diagonal
    diag = np.arange(Np)
    W[diag, diag] = np.inf
    #
    while n_seen != Np:
        new_edge = np.argmin(W[pnts_seen], axis=None)
        new_edge = divmod(new_edge, Np)
        new_edge = [pnts_seen[new_edge[0]], new_edge[1]]
        pairs.append(new_edge)
        pnts_seen.append(new_edge[1])
        W[pnts_seen, new_edge[1]] = np.inf
        W[new_edge[1], pnts_seen] = np.inf
        n_seen += 1
    pairs = np.array(pairs)
    frum = a_copy[pairs[:, 0]]
    too = a_copy[pairs[:, 1]]
    fr_to = np.concatenate((frum, too), axis=1)  # np.vstack(pairs)
    fr_to = uts(fr_to, names=['X_orig', 'Y_orig', 'X_dest', 'Y_dest'])
    return repack_fields(fr_to)
def rasterize(moc_data, order=None):
    """Convert a multi-order HEALPix dataset to fixed-order NESTED ordering.

    Parameters
    ----------
    moc_data : `numpy.ndarray`
        A multi-order HEALPix dataset stored as a Numpy record array whose
        first column is called UNIQ and contains the NUNIQ pixel index. Every
        point on the unit sphere must be contained in exactly one pixel in the
        dataset.
    order : int, optional
        The desired output resolution order, or :obj:`None` for the maximum
        resolution present in the dataset.

    Returns
    -------
    nested_data : `numpy.ndarray`
        A fixed-order, NESTED-ordering HEALPix dataset with all of the columns
        that were in moc_data, with the exception of the UNIQ column.
    """
    if order is None or order < 0:
        order = -1
    else:
        orig_order, orig_nest = uniq2nest(moc_data['UNIQ'])
        to_downsample = order < orig_order
        if np.any(to_downsample):
            to_keep = table.Table(moc_data[~to_downsample], copy=False)
            orig_order = orig_order[to_downsample]
            orig_nest = orig_nest[to_downsample]
            to_downsample = table.Table(moc_data[to_downsample], copy=False)
            ratio = 1 << (2 * np.int64(orig_order - order))
            weights = 1.0 / ratio
            for colname, column in to_downsample.columns.items():
                if colname != 'UNIQ':
                    column *= weights
            to_downsample['UNIQ'] = nest2uniq(order, orig_nest // ratio)
            to_downsample = to_downsample.group_by('UNIQ').groups.aggregate(
                np.sum)
            moc_data = table.vstack((to_keep, to_downsample))

    # Ensure that moc_data has appropriate padding for each of its columns to
    # be properly aligned in order to avoid undefined behavior.
    moc_data = repack_fields(np.asarray(moc_data), align=True)

    return _rasterize(moc_data, order=order)
def _reorder_prob(self, prob, model, gmm):
    """Reorder probabilities to have consistent output.

    Parameters
    ----------
    prob : numpy.ndarray, shape (N, n_components)
        Sample at which to predict.
    model : str
        Model used for prediction.
    gmm : sklearn.mixture.GaussianMixture
        GMM object of model used for prediction.

    Returns
    -------
    gmm_p : numpy.ndarray, shape (N, n_components)
        Unsorted probabilities output by GMM prediction.
    """
    n_components = _models[model]['n_components']
    if n_components == 3:
        pins = _polin_pins
    elif n_components == 4:
        pins = _branch_pins

    fields = _models[model]['fields']
    pin_data = pins[fields]

    # sklearn.gmm can't take in structured arrays, so a workaround...
    try:
        arr = pin_data[fields].copy().view((float, len(fields)))
        pin_prob = gmm.predict_proba(arr)
    except ValueError:
        arr = repack_fields(pin_data[fields]).view((float, len(fields)))
        pin_prob = gmm.predict_proba(arr)

    ordered_indices = [np.argmax(p) for p in pin_prob]

    # check for duplicates
    if len(set(ordered_indices)) != n_components:
        print(f'{model} probabilities were not reordered')
        return prob

    prob[:, list(range(n_components))] = prob[:, ordered_indices]
    return prob
def job(worker_id):
    i_start, i_end = ranges[worker_id]
    # collect data
    for i in range(i_start, i_end):
        # load file into Numpy structured array
        data = np.genfromtxt(valid_files[i], skip_header=1,
                             dtype=SAMPLE_DTYPE, delimiter=',')
        # remove field
        data_cleaned = repack_fields(rmfield(data, 'lon', 'lat'))
        np.savetxt(valid_files[i], data_cleaned, delimiter=',',
                   header=','.join(new_header),
                   fmt=','.join(['%d', '%f', '%f', '%f', '%f', '%f', '%f', '%s']))
    return
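# The rmfield helper called above is not shown in this snippet. A minimal
# sketch of one possible implementation, matching the call site (the name and
# signature are assumptions, not the original author's code): it drops the
# named fields and leaves repack_fields to remove the resulting padding.
def rmfield(arr, *fieldnames):
    keep = [name for name in arr.dtype.names if name not in fieldnames]
    return arr[keep]  # still a padded view until repack_fields is applied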
def crosstab_array(a, flds=None):
    """Frequency and crosstabulation for structured arrays.

    Parameters
    ----------
    a : array
        Input structured array.
    flds : string or list
        Fields/columns to use in the analysis.  For a single column, a string
        is all that is needed.  Multiple columns require a list of field names.

    Notes
    -----
    (1) Slice the input array by the classification fields.
    (2) Sort the sliced array using the flds as sorting keys.
    (3) Use unique on the sorted array to return the results.
    (4) Reassemble the original columns and the new count data.
    """
    if flds is None:
        return None
    if isinstance(flds, (str)):
        flds = [flds]
    a = repack_fields(a[flds])  # need to repack fields
    # a = _keep_fields(a, flds)  # alternative to repack_fields
    idx = np.argsort(a, axis=0, order=flds)  # (2) sort
    a_sort = a[idx]
    uni, cnts = np.unique(a_sort, return_counts=True)  # (3) unique, count
    dt = uni.dtype.descr
    dt.append(('Count', '<i4'))
    fr = np.empty_like(uni, dtype=dt)
    names = fr.dtype.names
    vals = list(zip(*uni)) + [cnts.tolist()]  # (4) reassemble
    N = len(names)
    for i in range(N):
        fr[names[i]] = vals[i]
    return fr
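# Hypothetical usage of crosstab_array above; the input array and field names
# are invented for illustration. Expected result: one row per unique
# (Zone, Class) combination plus a Count column.
import numpy as np

a = np.array([(1, 'A'), (1, 'A'), (2, 'B')],
             dtype=[('Zone', 'i4'), ('Class', 'U1')])
print(crosstab_array(a, flds=['Zone', 'Class']))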
async def getSpec(request):
    ''' '''
    params = await request.post()
    try:
        id_list = params['id_list']
        values = params['values']    # NYI
        cutout = params['cutout']    # NYI
        fmt = params['format']
        align = (params['align'].lower() == 'true')
        w0 = float(params['w0'])
        w1 = float(params['w1'])
        context = params['context']
        profile = params['profile']
        debug = (params['debug'].lower() == 'true')
        verbose = (params['verbose'].lower() == 'true')
    except Exception as e:
        logging.error('Param Error: ' + str(e))
        return web.Response(text='Param Error: ' + str(e))

    st_time = time.time()

    # Instantiate the dataset service based on the context.
    svc = _getSvc(context)
    svc.debug = debug
    svc.verbose = verbose

    # From the service call we get a string which we'll need to map to
    # an array of identifiers valid for the service.
    ids = svc.expandIDList(id_list)

    if debug:
        print('GETSPEC ----------')
        print('len ids = ' + str(len(ids)))
        print('ty ids = ' + str(type(ids)))
        print('ty ids elem = ' + str(type(ids[0])))

    # If called from something other than the client API we might not know
    # the wavelength limits of the collection, so compute it here so we can
    # still align properly.
    if w0 in [None, 0.0] and w1 in [None, 0.0] and align:
        w0, w1, nspec = _listSpan(svc, ids)

    res = None
    align = (w0 != w1)
    nspec = 0
    ptime = 0.0
    for id in ids:
        p0 = time.time()
        nspec = nspec + 1
        if fmt.lower() == 'fits':
            fname = svc.dataPath(id, 'fits')
            data = svc.readFile(str(fname))
            return web.Response(body=data)
        else:
            fname = svc.dataPath(id, 'npy')
            data = svc.getData(str(fname))

            if values != 'all':
                # Extract the subset of values.
                dvalues = data[[c for c in list(data.dtype.names)
                                if c in values]]
                data = rfn.repack_fields(dvalues)

            if not align:
                f = data
            else:
                wmin, wmax = data['loglam'][0], data['loglam'][-1]
                disp = float((wmax - wmin) / float(len(data['loglam'])))
                lpad = int(np.around(max((wmin - w0) / disp, 0.0)))
                rpad = int(np.around(max((w1 - wmax) / disp, 0.0)))
                if lpad == 0 and rpad == 0:
                    f = data
                else:
                    f = np.pad(data, (lpad, rpad), mode='constant',
                               constant_values=0)
                    f['loglam'] = np.linspace(w0, w1, len(f))  # patch wavelength array

            if debug:
                print(str(id))
                print(fname)
                print('wmin,wmax = (%g,%g) disp=%g' % (wmin, wmax, disp))
                print('w0,w1 = (%g,%g) pad = (%d,%d)' % (w0, w1, lpad, rpad))
                print('len f = %d len data = %d' % (len(f), len(data)))

            if res is None:
                res = f
            else:
                res = np.vstack((res, f))
        p1 = time.time()
        ptime = ptime + (p1 - p0)

    if debug:
        print('res type: ' + str(type(res)) + ' shape: ' + str(res.shape))

    # Convert the array to bytes for return.
    fd = BytesIO()
    np.save(fd, res, allow_pickle=False)
    _bytes = fd.getvalue()

    en_time = time.time()
    logging.info('getSpec time: %g  NSpec: %d  Bytes: %d' %
                 (en_time - st_time, nspec, len(_bytes)))

    return web.Response(body=_bytes)
def download_and_repack(country_code=None, network_code=None, circle=None,
                        token=None, source=None, destination=None,
                        byte_order="b", verbose=False):
    """
    Downloads and packs the base station data. So far, only opencellid.org
    supported.

    Args:
        country_code (int): the country code;
        network_code (int): the network code;
        circle (tuple): latitude, longitude (degrees) and radius in km;
        token (str): service token;
        source (str): downloaded file name;
        destination (str): destination file name;
        byte_order (str): byte order;
        verbose (bool): prints verbose output;
    """
    def v(*args, **kwargs):
        if verbose:
            print(*args, **kwargs)

    byte_order = byte_order.lower()
    if byte_order not in "bl":
        raise ValueError("Unknown byte order: {}".format(byte_order))
    byte_order = dict(b=">", l="<")[byte_order]

    if destination is None:
        if country_code is None:
            destination = "all.bin"
        else:
            if network_code is None:
                destination = "{country_code}.bin".format(
                    country_code=country_code)
            else:
                destination = "{country_code}-{network_code}.bin".format(
                    country_code=country_code, network_code=network_code)
    v("Target: {}".format(destination))

    if source is None:
        if token is None:
            v("No token specified: downloading from git")
            if country_code is None:
                raise ValueError("Cannot download worldwide database yet")
            else:
                url = "https://github.com/pulkin/agps-data/raw/master/opencellid.org/{country_code}.csv.gz".format(
                    country_code=country_code)
        else:
            if country_code is None:
                url = "https://opencellid.org/ocid/downloads?token={token}&type=full&file=cell_towers.csv.gz".format(
                    token=token)
            else:
                url = "https://opencellid.org/ocid/downloads?token={token}&type=mcc&file={country_code}.csv.gz".format(
                    token=token, country_code=country_code)

        v("Downloading {} ...".format(url))
        response = urllib.request.urlopen(
            urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
                }))
        buf = io.BytesIO(response.read())
    else:
        v("Reading {} ...".format(source))
        buf = open(source, 'rb')

    v("Unzipping ...")
    buf_raw = gzip.GzipFile(fileobj=buf, mode='rb')
    buf_txt = io.TextIOWrapper(buf_raw)

    v("Parsing ...")
    dtype = [
        ('radio_type', 'S4'),
        ('mcc', byte_order + 'u2'),
        ('mnc', byte_order + 'u2'),
        ('area_code', byte_order + 'u2'),
        ('cell', byte_order + 'u2'),
        ('lon', byte_order + 'f4'),
        ('lat', byte_order + 'f4'),
    ]
    data = numpy.genfromtxt(buf_txt, dtype=dtype, skip_header=1,
                            delimiter=",", usecols=(0, 1, 2, 3, 4, 6, 7))

    v("Filtering ...")
    data = data[data["radio_type"] == b"GSM"][[
        "mcc", "mnc", "area_code", "cell", "lon", "lat"
    ]]
    if country_code is not None:
        v(" - mcc: {:d}".format(len(data)), end="")
        data = data[data["mcc"] == country_code]
        v(" -> {:d}".format(len(data)))
    if network_code is not None:
        v(" - mnc: {:d}".format(len(data)), end="")
        data = data[data["mnc"] == network_code]
        v(" -> {:d}".format(len(data)))
    if circle is not None:
        v(" - circle: {:d}".format(len(data)), end="")
        theta0, phi0, r0 = circle
        theta = (data["lat"] - theta0) * numpy.pi / 180
        phi = (data["lon"] - phi0) * numpy.pi / 180
        mask = (theta**2 + phi**2 * numpy.cos(theta0)**2) < \
            (r0 * 1e3 / earth_radius)**2
        data = data[mask]
        v(" -> {:d}".format(len(data)))

    if len(data) == 0:
        raise ValueError("No data to save")

    v("Sorting ...")
    data = numpy.sort(data, order=("mcc", "mnc", "area_code", "cell"))
    v("Items total: {:d}".format(len(data)))

    v("Preparing tables ...")
    keys = "mcc", "mnc"
    mask = numpy.zeros(len(data), dtype=bool)
    mask[0] = True
    for k in keys:
        mask[1:] |= data[k][1:] != data[k][:-1]
    table_ptrs = numpy.where(mask)[0]
    table_data = recfunctions.repack_fields(data[table_ptrs][list(keys)])

    v("Saving ...")
    with open(destination, 'wb') as f:
        f.write(b'agps-bin')
        f.write(b'\x00')
        f.write({">": b">", "<": b"<"}[byte_order])
        f.write(struct.pack(byte_order + "L", len(table_ptrs)))
        for _d, _p in zip(table_data, table_ptrs):
            f.write(struct.pack(byte_order + "HHL", *_d, _p))
        recfunctions.repack_fields(
            data[["area_code", "cell", "lon", "lat"]]).tofile(f)
        v("Total size: {:d} bytes".format(f.tell()))
    v("Done")
def write(self, filename, nside_output, write_pos=False):
    """Write galaxy catalog to disk.

    Returns
    -------
    None
    """
    domain = self.nbody.domain

    if 'ID' not in list(self.catalog.keys()):
        self.catalog['ID'] = np.zeros(len(self.catalog['PX']))

    if 'TRA' not in list(self.catalog.keys()):
        self.catalog['TRA'], self.catalog['TDEC'] = hp.vec2ang(np.vstack(
            [self.catalog['PX'], self.catalog['PY'], self.catalog['PZ']]).T,
            lonlat=True)
        self.catalog['EPSILON'] = np.zeros((len(self.catalog['PX']), 2))
        self.catalog['SIZE'] = np.zeros(len(self.catalog['PX']))
        self.catalog['KAPPA'] = np.zeros(len(self.catalog['PX']))
        self.catalog['MU'] = np.zeros(len(self.catalog['PX']))
        self.catalog['W'] = np.zeros(len(self.catalog['PX']))
        self.catalog['GAMMA1'] = np.zeros(len(self.catalog['PX']))
        self.catalog['GAMMA2'] = np.zeros(len(self.catalog['PX']))
        self.catalog['DEC'] = np.zeros(len(self.catalog['PX']))
        self.catalog['RA'] = np.zeros(len(self.catalog['PX']))
        self.catalog['LMAG'] = np.zeros_like(self.catalog['TMAG'])
        self.catalog['OMAG'] = np.zeros_like(self.catalog['TMAG'])
        self.catalog['OMAGERR'] = np.zeros_like(self.catalog['TMAG'])
        self.catalog['FLUX'] = np.zeros_like(self.catalog['TMAG'])
        self.catalog['IVAR'] = np.zeros_like(self.catalog['TMAG'])

    cdtype = np.dtype(
        list(
            zip(self.catalog.keys(),
                [(self.catalog[k].dtype.type, self.catalog[k].shape[1])
                 if len(self.catalog[k].shape) > 1
                 else self.catalog[k].dtype.type
                 for k in self.catalog.keys()])))

    out = np.zeros(len(self.catalog[list(self.catalog.keys())[0]]),
                   dtype=cdtype)
    for k in self.catalog.keys():
        out[k] = self.catalog[k]

    r = np.sqrt(out['PX']**2 + out['PY']**2 + out['PZ']**2)
    pix = hp.vec2pix(domain.nside, out['PX'], out['PY'], out['PZ'],
                     nest=domain.nest)
    boxnum = domain.boxnum

    # cut off buffer region, make sure we only have the pixel we want
    print('Cutting catalog to {} <= z < {}'.format(
        self.nbody.cosmo.zofR(domain.rbins[boxnum][domain.rbin]),
        self.nbody.cosmo.zofR(domain.rbins[boxnum][domain.rbin + 1])))
    sys.stdout.flush()

    idx = ((domain.rbins[boxnum][domain.rbin] <= r) &
           (r < domain.rbins[boxnum][domain.rbin + 1]) &
           (domain.pix == pix))
    out = out[idx]
    del idx

    keys = list(self.catalog.keys())
    if len(keys) == 0:
        return

    for k in keys:
        del self.catalog[k]
    del self.catalog

    if nside_output != domain.nside:
        map_in = np.arange(12 * domain.nside**2)
        if domain.nest:
            order = 'NESTED'
        else:
            order = 'RING'
        map_out = hp.ud_grade(map_in, nside_output, order_in=order,
                              order_out=order)
        pix, = np.where(map_out == domain.pix)
    else:
        pix = [domain.pix]

    for p in pix:
        fname = '{}.{}.fits'.format(filename, p)
        print('Writing to {}'.format(fname))
        if write_pos:
            pfname = '{}.{}.lens.fits'.format(filename, p)

        if os.path.exists(fname):
            f = fitsio.FITS(fname)
            ngal = f[-1].read_header()['NAXIS2']
            f.close()
        else:
            ngal = 0

        pix = hp.vec2pix(nside_output, out['PX'], out['PY'], out['PZ'],
                         nest=domain.nest)
        idx = pix == p

        if np.sum(idx) < 100:
            continue

        out['ID'][idx] = (p * 1e9 + np.arange(len(out['PX'][idx])) +
                          ngal).astype(np.int64)

        if os.path.exists(fname):
            with fitsio.FITS(fname, 'rw') as f:
                f[-1].append(out[idx])
        else:
            fitsio.write(fname, out[idx])

        if write_pos:
            if os.path.exists(pfname):
                with fitsio.FITS(pfname, 'rw') as f:
                    f[-1].append(
                        repack_fields(out[['ID', 'PX', 'PY', 'PZ']][idx]))
            else:
                fitsio.write(
                    pfname,
                    repack_fields(out[['ID', 'PX', 'PY', 'PZ']][idx]))

    del out
def convert2hdf5(modellist, star_columns=None, binary_columns=None,
                 profile_columns=None, add_stopping_condition=True,
                 skip_existing=True,
                 star1_history_file='LOGS/history1.data',
                 star2_history_file='LOGS/history2.data',
                 binary_history_file='LOGS/binary_history.data',
                 log_file='log.txt',
                 profile_files=None, profiles_path='',
                 profile_pattern='*.profile',
                 input_path_kw='path', input_path_prefix='',
                 output_path=None, verbose=False):

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    for i, model in modellist.iterrows():

        print(input_path_prefix, model[input_path_kw])

        if not os.path.isdir(Path(input_path_prefix, model[input_path_kw])):
            continue

        if skip_existing and os.path.isfile(
                Path(output_path, model[input_path_kw]).with_suffix('.h5')):
            if verbose:
                print(i, model[input_path_kw], ': exists, skipping')
            continue

        if verbose:
            print(i, model[input_path_kw], ': processing')

        # store all columns of the input file in the hdf5 file
        data = {}
        extra_info = {}
        for col in model.index:
            extra_info[col] = model[col]

        # obtain the termination code and store if requested
        termination_code = 'uk'
        if add_stopping_condition:
            lines = get_end_log_file(
                Path(input_path_prefix, model[input_path_kw], log_file))
            for line in lines:
                if 'termination code' in line:
                    termination_code = line.split()[-1]
            extra_info['termination_code'] = termination_code

        # store the nnaps-version in the output data.
        extra_info['nnaps-version'] = __version__

        data['extra_info'] = extra_info

        # check if all history files that are requested are available and can
        # be read. If there is an error, skip to the next model
        history = {}

        if star1_history_file is not None:
            try:
                d1 = read_mesa_output(
                    Path(input_path_prefix, model[input_path_kw],
                         star1_history_file))[1]
                if star_columns is not None:
                    d1 = rf.repack_fields(d1[star_columns])
                history['star1'] = d1
            except Exception as e:
                if verbose:
                    print("Error in reading star1: ", e)
                continue

        if star2_history_file is not None:
            try:
                d2 = read_mesa_output(
                    Path(input_path_prefix, model[input_path_kw],
                         star2_history_file))[1]
                if star_columns is not None:
                    d2 = rf.repack_fields(d2[star_columns])
                history['star2'] = d2
            except Exception as e:
                if verbose:
                    print("Error in reading star2: ", e)
                continue

        if binary_history_file is not None:
            try:
                d3 = read_mesa_output(
                    Path(input_path_prefix, model[input_path_kw],
                         binary_history_file))[1]
                if star_columns is not None:
                    d3 = rf.repack_fields(d3[binary_columns])
                history['binary'] = d3
            except Exception as e:
                if verbose:
                    print("Error in reading binary: ", e)
                continue

        data['history'] = history

        # check if profiles exist and store them if requested.
        # Also make a profile lookup table (legend)
        profiles = {}
        profile_legend = []
        profile_name_length = 0  # store longest profile name to create recarray of profile_legend
        if profile_files is not None:

            if profile_files == 'all':
                profile_paths = Path(input_path_prefix, model[input_path_kw],
                                     profiles_path).glob(profile_pattern)
            else:
                profile_paths = [
                    Path(input_path_prefix, model[input_path_kw],
                         profiles_path, p) for p in profile_files
                ]

            for filepath in profile_paths:

                if not filepath.is_file():
                    continue

                profile_name = filepath.stem

                header, profile_data = read_mesa_output(filename=filepath,
                                                        only_first=False)

                if profile_columns is not None:
                    profile_data = rf.repack_fields(
                        profile_data[profile_columns])
                profiles[profile_name] = profile_data

                if len(profile_name) > profile_name_length:
                    profile_name_length = len(profile_name)
                profile_legend.append((header['model_number'], profile_name))

        if len(profiles.keys()) >= 1:
            data['profiles'] = profiles

            profile_legend = np.array(
                profile_legend,
                dtype=[('model_number', 'f8'),
                       ('profile_name', 'a' + str(profile_name_length))])
            data['profile_legend'] = profile_legend

        # rather annoying way to assure that Path doesn't cut off part of the
        # folder name when adding the .h5 suffix. If not, this will happen:
        # M1.080_M0.502_P192.67_Z0.01129 -> M1.080_M0.502_P192.67_Z0.h5
        output_file = Path(output_path, model[input_path_kw])
        output_file = output_file.with_suffix(output_file.suffix + '.h5')

        fileio.write2hdf5(data, output_file, update=False)
def process_sim(idx, total, net, sim_flp, tmp_dir, warmup_prc, keep_prc,
                sequential=False):
    """
    Loads and processes data from a single simulation.

    For logging purposes, "idx" is the index of this simulation amongst
    "total" simulations total. Uses "net" to determine the relevant input and
    output features. "sim_flp" is the path to the simulation file. The parsed
    results are stored in "tmp_dir". Drops the first "warmup_prc" percent of
    packets. Of the remaining packets, only "keep_prc" percent are kept. See
    utils.save_tmp_file() for the format of the results file.

    Returns the path to the results file and a descriptive utils.Sim object.
    """
    sim, dat = utils.load_sim(
        sim_flp, msg=f"{idx + 1:{f'0{len(str(total))}'}}/{total}")
    if dat is None:
        return None

    # Drop the first few packets so that we consider steady-state behavior only.
    dat = dat[math.floor(dat.shape[0] * warmup_prc / 100):]

    # Split each data matrix into two separate matrices: one with the input
    # features only and one with the output features only. The names of the
    # columns correspond to the feature names in in_spc and out_spc.
    assert net.in_spc, f"{sim_flp}: Empty in spec."
    assert net.out_spc, f"{sim_flp}: Empty out spec."
    dat_in = recfunctions.repack_fields(dat[net.in_spc])
    dat_out = recfunctions.repack_fields(dat[net.out_spc])
    # Convert output features to class labels.
    dat_out_raw = dat_out
    dat_out = net.convert_to_class(sim, dat_out)

    # If the results contain NaNs or Infs, then discard this simulation.
    def has_non_finite(arr):
        for fet in arr.dtype.names:
            if not np.isfinite(arr[fet]).all():
                print(f"    Simulation {sim_flp} has NaNs or Infs in feature "
                      f"{fet}")
                return True
        return False
    if has_non_finite(dat_in) or has_non_finite(dat_out):
        return None

    # Verify data.
    assert dat_in.shape[0] == dat_out.shape[0], \
        f"{sim_flp}: Input and output should have the same number of rows."
    # Find the uniques classes in the output features and make sure that they
    # are properly formed. Assumes that dat_out is a structured numpy array
    # containing a column named "class".
    for cls in set(dat_out["class"].tolist()):
        assert 0 <= cls < net.num_clss, f"Invalid class: {cls}"

    # Transform the data as required by this specific model.
    dat_in, dat_out, dat_out_raw, dat_out_oracle, scl_grps = net.modify_data(
        sim, dat_in, dat_out, dat_out_raw,
        # Must put the column name in a list for the result to be a
        # structured array.
        dat_out_oracle=dat[["mathis model label-ewma-alpha0.01"]],
        sequential=sequential)

    # Select a fraction of the data.
    num_rows = dat_in.shape[0]
    num_to_pick = math.ceil(num_rows * keep_prc / 100)
    idxs = np.random.random_integers(0, num_rows - 1, num_to_pick)
    dat_in = dat_in[idxs]
    dat_out = dat_out[idxs]
    dat_out_raw = dat_out_raw[idxs]
    dat_out_oracle = dat_out_oracle[idxs]

    # To avoid errors with sending large matrices between processes, store the
    # results in a temporary file.
    dat_flp = path.join(tmp_dir, f"{path.basename(sim_flp)[:-4]}_tmp.npz")
    utils.save_tmp_file(
        dat_flp, dat_in, dat_out, dat_out_raw, dat_out_oracle, scl_grps)
    return dat_flp, sim
def extract_fets(dat, split_name, net):
    """
    Extracts net's input and output features from dat. Returns a tuple of the
    form: (dat_in, dat_out, dat_extra, scaling groups).
    """
    # Split each data matrix into two separate matrices: one with the input
    # features only and one with the output features only. The names of the
    # columns correspond to the feature names in in_spc and out_spc.
    assert net.in_spc, f"{net.name}: Empty in spec."
    num_out_fets = len(net.out_spc)
    # This is not a strict requirement from a modeling point of view, but is
    # assumed to make data processing easier.
    assert num_out_fets == 1, \
        (f"{net.name}: Out spec must contain a single feature, but actually "
         f"contains: {net.out_spc}")

    # Remove samples where the ground truth output is unknown.
    len_before = dat.shape[0]
    dat = dat[dat[list(net.out_spc)] != -1][0]
    removed = len_before - dat.shape[0]
    if removed > 0:
        print(f"Removed {removed} rows with unknown out_spc from split "
              f"\"{split_name}\".")

    dat_in = recfunctions.repack_fields(dat[list(net.in_spc)])
    dat_out = recfunctions.repack_fields(dat[list(net.out_spc)])
    # Create a structured array to hold extra data that will not be used as
    # features but may be needed by the training/testing process.
    dtype_extra = (
        # The "raw" entry is the unconverted out_spc.
        [("raw",
          [typ for typ in dat.dtype.descr if typ[0] in net.out_spc][0][1])] +
        [typ for typ in dat.dtype.descr if typ[0] in features.EXTRA_FETS])
    dat_extra = np.empty(shape=dat.shape, dtype=dtype_extra)
    dat_extra["raw"] = dat_out
    for typ in features.EXTRA_FETS:
        dat_extra[typ] = dat[typ]
    dat_extra = recfunctions.repack_fields(dat_extra)

    is_dt = isinstance(net, models.HistGbdtSklearnWrapper)
    if not is_dt:
        # Verify that there are no NaNs or Infs in the data.
        for fet in dat_in.dtype.names:
            assert (not (
                np.isnan(dat_in[fet]).any() or np.isinf(dat_in[fet]).any())), \
                ("Warning: NaNs or Infs in input feature for split "
                 f"\"{split_name}\": {fet}")
        assert (not (
            np.isnan(dat_out[features.LABEL_FET]).any() or
            np.isinf(dat_out[features.LABEL_FET]).any())), \
            f"Warning: NaNs or Infs in ground truth for split \"{split_name}\"."

    if dat_in.shape[0] > 0:
        # Convert all instances of -1 (feature value unknown) to either the
        # mean for that feature or NaN.
        bad_fets = []
        for fet in dat_in.dtype.names:
            invalid = dat_in[fet] == -1
            if invalid.all():
                bad_fets.append(fet)
                continue
            dat_in[fet][invalid] = (float("NaN") if is_dt else np.mean(
                dat_in[fet][np.logical_not(invalid)]))
            assert (dat_in[fet] != -1).all(), \
                f"Found \"-1\" in split \"{split_name}\" feature: {fet}"
        assert not bad_fets, \
            (f"Features in split \"{split_name}\" contain only \"-1\" "
             f"({len(bad_fets)}): {bad_fets}")

    # Convert output features to class labels.
    dat_out = net.convert_to_class(dat_out)

    # Verify data.
    assert dat_in.shape[0] == dat_out.shape[0], \
        "Input and output should have the same number of rows."
    # Find the uniques classes in the output features and make sure that they
    # are properly formed. Assumes that dat_out is a structured numpy array
    # containing a single column specified by features.LABEL_FET.
    for cls in np.unique(dat_out[features.LABEL_FET]).tolist():
        assert 0 <= cls < net.num_clss, f"Invalid class: {cls}"

    # Transform the data as required by this specific model.
    # TODO: Refactor this to be compatible with bulk data splits.
    # dat_in, dat_out, dat_extra, scl_grps = net.modify_data(
    #     exp, dat_in, dat_out, dat_extra, sequential=sequential)
    scl_grps = list(range(len(dat_in.dtype.names)))

    return dat_in, dat_out, dat_extra, scl_grps
def fc_to_Geo(in_fc, geom_kind=2, minX=0, minY=0, sp_ref=None, info=""):
    """Convert a FeatureClassToNumPyArray to a Geo array.

    This works with the geometry only.  Skip the attributes for later.  The
    processing requirements are listed below.  Just copy and paste.

    Parameters
    ----------
    in_fc : featureclass
        Featureclass in a file geodatabase.
    geom_kind : integer
        Points (0), Polylines (1) and Polygons (2)
    minX, minY : numbers
        If these values are 0, then the minimum values will be determined and
        used to shift the data towards the origin.
    sp_ref : text
        Spatial reference name.  eg `'NAD_1983_CSRS_MTM_9'`

    Notes
    -----
    The `arcpy.da.Describe` method takes a substantial amount of time.

    >>> %timeit Describe(fc2)
    ... 355 ms ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    """

    def _area_part_(a):
        """Mini e_area, used by areas and centroids."""
        x0, y1 = (a.T)[:, 1:]
        x1, y0 = (a.T)[:, :-1]
        e0 = np.einsum('...i,...i->...i', x0, y0)
        e1 = np.einsum('...i,...i->...i', x1, y1)
        return np.sum((e0 - e1) * 0.5)

    def _cw_(a):
        """Clockwise check."""
        return 1 if _area_part_(a) > 0. else 0

    # -- (1) Foundational steps
    # Create the array, extract the object id values.
    # To avoid floating point issues, extract the coordinates, round them to a
    # finite precision and shift them to the x-y origin
    #
    kind = geom_kind
    if sp_ref is None:
        # sp_ref = get_SR(in_fc, verbose=False)
        sp_ref = "undefined"
    a = FeatureClassToNumPyArray(
        in_fc, ['OID@', 'SHAPE@X', 'SHAPE@Y'],
        explode_to_points=True)  # spatial_reference=sp_ref
    oids = a['OID@']
    xy = a[['SHAPE@X', 'SHAPE@Y']]
    mn = [np.min(xy['SHAPE@X']), np.min(xy['SHAPE@Y'])]
    mx = [np.max(xy['SHAPE@X']), np.max(xy['SHAPE@Y'])]
    extent = np.array([mn, mx])
    # -- shift if needed
    dx, dy = mn
    if minX != 0.:
        dx = minX  # mn[0] - minX
    if minY != 0.:
        dy = minY  # mn[1] - minY
    xy['SHAPE@X'] = np.round(xy['SHAPE@X'] - dx, 3)
    xy['SHAPE@Y'] = np.round(xy['SHAPE@Y'] - dy, 3)
    xy.dtype.names = ['X', 'Y']
    xy = repack_fields(xy)
    #
    # -- (2) Prepare the oid data for use in identifying from-to points.
    uniq, indx, cnts = np.unique(oids, True, return_counts=True)
    id_vals = oids[indx]
    indx = np.concatenate((indx, [a.shape[0]]))
    #
    # -- (3) Construct the IFT data using `id_fr_to` to carry the load.
    IFT_ = np.asarray(id_fr_to(xy, oids))
    cols = IFT_.shape[0]
    IFT = np.full((cols, 6), -1, dtype=np.int32)
    IFT[:, :3] = IFT_
    #
    # -- (4) clockwise check for polygon parts to identify outer/inner rings
    if kind == 2:  # polygons
        xy_arr = stu(xy)  # View the data as an unstructured array
        cl_wise = np.array([_cw_(xy_arr[i[1]:i[2]]) for i in IFT_])
    else:  # not relevant for polylines or points
        cl_wise = np.full_like(oids, -1)
    IFT[:, 3] = cl_wise
    #
    # -- (5) construct part_ids and pnt_nums
    if kind == 2:
        parts = [np.cumsum(IFT[:, 3][IFT[:, 0] == i]) for i in id_vals]
        part_ids = np.concatenate(parts)
        ar = np.where(IFT[:, 3] == 1)[0]
        ar0 = np.stack((ar[:-1], ar[1:])).T
        pnt_nums = np.zeros(IFT.shape[0], dtype=np.int32)
        for (i, j) in ar0:  # now provide the point numbers per part per shape
            pnt_nums[i:j] = np.arange((j - i))  # smooth!!!
    else:
        part_ids = np.ones_like(oids)
        pnt_nums = np.ones_like(oids)
    IFT[:, 4] = part_ids
    IFT[:, 5] = pnt_nums
    #
    # -- (6) Create the output array... as easy as ``a`` to ``z``
    z = Geo(xy_arr, IFT, kind, Extent=extent, Info="test", SR=sp_ref)
    out = copy.deepcopy(z)
    return out