def retain_relevant_fields(data):
    # TODO read these from a conf file
    aggregate_fields = {}
    aggregate_fields['aggr0_6'] = data['ika0'] + data['ika1'] + data['ika2'] + \
        data['ika3'] + data['ika4'] + data['ika5'] + data['ika6']
    aggregate_fields['aggr7_12'] = data['ika7'] + data['ika8'] + data['ika9'] + \
        data['ika10'] + data['ika11'] + data['ika12']
    aggregate_fields['aggr13_17'] = data['ika13'] + data['ika14'] + \
        data['ika15'] + data['ika16'] + data['ika17']
    aggregate_fields['aggr18_29'] = data['ika18'] + data['ika19'] + \
        data['ika20'] + data['ika21'] + data['ika22'] + data['ika23'] + \
        data['ika24'] + data['ika25_29']
    aggregate_fields['aggr30_64'] = data['ika30_34'] + data['ika35_39'] + \
        data['ika40_44'] + data['ika45_49'] + data['ika50_54'] + \
        data['ika55_59'] + data['ika60_64']
    aggregate_fields['aggr64_'] = data['ika65_69'] + data['ika70_74'] + \
        data['ika75_79'] + data['ika80_84'] + data['ika85_89'] + \
        data['ika90_94'] + data['ika95_']
    # add the aggregate fields to the data rec array
    augmented_data = rf.rec_append_fields(data,
                                          list(aggregate_fields.keys()),
                                          list(aggregate_fields.values()))
    # ...and add these fields only afterwards: they are needed here just for
    # their column names, but passing them to rec_append_fields above would
    # raise an exception because they already exist in data
    aggregate_fields['asyht'] = data['asyht']
    aggregate_fields['ruots'] = data['ruots']
    aggregate_fields['ekoord'] = data['ekoord']
    aggregate_fields['nkoord'] = data['nkoord']
    # drop all fields whose names are not in aggregate_fields
    fields2drop = [d for d in data.dtype.names if d not in aggregate_fields.keys()]
    return rf.rec_drop_fields(augmented_data, fields2drop)

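# A minimal sketch (not from the original code) of the append-then-drop pattern
# used in retain_relevant_fields above, shown on a toy structured array; the
# field names here are made up and only two columns are aggregated.
import numpy as np
from numpy.lib import recfunctions as rf

toy = np.array([(1, 2, 3), (4, 5, 6)],
               dtype=[('ika0', 'i8'), ('ika1', 'i8'), ('asyht', 'i8')])
aggr = toy['ika0'] + toy['ika1']
out = rf.rec_append_fields(toy, ['aggr0_1'], [aggr])
out = rf.rec_drop_fields(out, ['ika0', 'ika1'])
# out now has only the 'asyht' and 'aggr0_1' columns
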
def move_bad_fields_to_bottom(oldArray, orderedFieldList, orderedTypeList):
    """
    Move the given fields in a structured array to the bottom and change their type

    Input
    -----
    oldArray : numpy structured array
        previous array to modify
    orderedFieldList : list
        list of fields to move and change type
    orderedTypeList : list
        list of new types for the fields

    Returns an array with some fields moved to the bottom and with a different type
    """
    outArray = oldArray.copy()
    for name, typ in zip(orderedFieldList, orderedTypeList):
        # Remove the field of interest from the array
        tmpArray = rec.rec_drop_fields(outArray, name)
        # Append the same field at the end of the array with the right data type
        outArray = rec.rec_append_fields(tmpArray, name,
                                         oldArray[name].copy(), dtypes=typ)
    return outArray

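# Hypothetical usage of move_bad_fields_to_bottom: push the 'flag' column to the
# end and store it as float instead of int. The array contents are made up, and
# the `rec` alias is assumed to be numpy.lib.recfunctions (as in the snippet above).
import numpy as np
from numpy.lib import recfunctions as rec

a = np.array([(1, 0, 2.5), (2, 1, 3.5)],
             dtype=[('id', 'i8'), ('flag', 'i8'), ('value', 'f8')])
b = move_bad_fields_to_bottom(a, ['flag'], ['f8'])
# b.dtype.names == ('id', 'value', 'flag') and b['flag'].dtype == np.float64
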
def add_time_column(table, name='time', pop_start=True, pop_offset=True):
    """Append a column named 'time' by combining the gps_start and _offset

    Parameters
    ----------
    table : `EventTable`
        table of events to modify
    name : `str`, optional
        name of field to append, default: 'time'
    pop_start : `bool`, optional
        remove the 'gps_start' field when finished, default: `True`
    pop_offset : `bool`, optional
        remove the 'gps_offset' field when finished, default: `True`

    Returns
    -------
    mod : `recarray`, matches type of input
        a modified version of the input table with the new time field
    """
    type_ = type(table)
    t = table['gps_start'] + table['gps_offset']
    drop = []
    if pop_start:
        drop.append('gps_start')
    if pop_offset:
        drop.append('gps_offset')
    if drop:
        table = recfunctions.rec_drop_fields(table, drop)
    return recfunctions.rec_append_fields(table, [name], [t]).view(type_)

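# Hypothetical usage of add_time_column (not from the original project): a plain
# numpy recarray stands in for the `EventTable`; the gps_start/gps_offset field
# names match what the function expects, and 'snr' is an extra made-up column.
import numpy as np
from numpy.lib import recfunctions

events = np.rec.fromarrays(
    [np.array([1000000000, 1000000001]),
     np.array([0.25, 0.50]),
     np.array([5.0, 6.0])],
    names=['gps_start', 'gps_offset', 'snr'])
events = add_time_column(events)
# events.dtype.names == ('snr', 'time'); events['time'][0] == 1000000000.25
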
def delete_field(self, name):
    """Delete field with name."""
    if name not in self.dtype.names:
        raise ValueError(
            'Currently, can only delete single names from {}.'.format(
                self.dtype.names))
    new_array = rec_drop_fields(self, name)
    new = BoundStructArray(new_array, self.index_key, self._is_attr_of,
                           keys_multicol=self._keys_multicol)
    setattr(self._is_attr_of[0], self._is_attr_of[1], new)

def _drop_object_col(rec, warn=True):
    # ignore columns of type `object` since PyTables does not support these
    if rec.dtype.hasobject:
        object_fields = []
        fields = rec.dtype.fields
        for name in rec.dtype.names:
            if fields[name][0].kind == "O":
                object_fields.append(name)
                if warn:
                    log.warning("ignoring unsupported object branch '{0}'".format(name))
        # NumPy 1.7.1: TypeError: Cannot change data-type for object array.
        # return rec[non_object_fields]
        if object_fields:
            rec = recfunctions.rec_drop_fields(rec, object_fields)
    return rec

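# Hypothetical usage of _drop_object_col on a toy array with one object-typed
# column; warn=False is used so the module-level `log` object is not needed.
import numpy as np
from numpy.lib import recfunctions

rec_with_obj = np.array([(1, 'a'), (2, 'b')],
                        dtype=[('x', 'i8'), ('meta', 'O')])
clean = _drop_object_col(rec_with_obj, warn=False)
# clean.dtype.names == ('x',): the unsupported object column was removed
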
def _from_hdf_dataset(cls, dataset) -> "DropletTrack":
    """construct a droplet track by reading data from an hdf5 dataset

    Args:
        dataset: an HDF5 dataset from which the data of the droplet track is read
    """
    droplet_class = dataset.attrs["droplet_class"]
    obj = cls()
    if droplet_class == "None":
        return obj
    else:
        # there are values, so the emulsion is not empty
        # separate time from the data set
        times = dataset["time"]
        droplet_data = rfn.rec_drop_fields(dataset, "time")
        for time, data in zip(times, droplet_data):
            droplet = droplet_from_data(droplet_class, data)
            obj.append(droplet, time=time)  # type: ignore
    return obj

print(target)
'''
for i in range(len(data)):
    if data[target][i] > (data[target].mean() + 2*data[target].std()) or data[target][i] < (data[target].mean() - 2*data[target].std()):
        delList = np.append(delList, i)
        print (data[target].mean() - 1*data[target].std()), data[target].std()
'''
#clf = linear.BayesianRidge(verbose=True, alpha_1=2, alpha_2=2, lambda_1=.01, lambda_2=.01, fit_intercept=True, compute_score=True)
#clf = linear.BayesianRidge(verbose=True)
#clf = tree.DecisionTreeRegressor(max_depth=2)
clf = svm.SVR(C=10000.0, kernel='rbf', degree=1)

data = np.delete(data, delList, 0)
data, testa, features, fillVal = util.prepDataTrain(data, target, featuresList, False, 20, False, True, 'mean', False, 'set')
data = recfunctions.rec_drop_fields(data, delFeatures)

#features = ['CTI','Depth', 'RELI', 'LSTN']
#an.plotData(np.sqrt(1+data['P']), data['ELEV']*(-1*data['TMAP']))
#data, clust, enc, newCol = clusterData(data, clusterFields, True)
#testa, clust, enc, newCol = clusterData(testa, pickTest, True, enc, clust, False)
#features = np.concatenate((features, newCol))

#Use/tune your predictor
#clf.fit(data[features].tolist(), data[target])

#import pydot
#dot_data = StringIO.StringIO()
#tree.export_graphviz(clf, out_file=dot_data)
#graph = pydot.graph_from_dot_data(dot_data.getvalue())
#graph.write_pdf("./ds.pdf")

def records(self, category=None, region=None, fields=None, cuts=None,
            include_weight=True, systematic='NOMINAL', scale=1.,
            return_idx=False, **kwargs):
    from .ztautau import Ztautau
    from functools import reduce  # reduce is a builtin only on Python 2
    if include_weight and fields is not None:
        if 'weight' not in fields:
            fields = list(fields) + ['weight']
    selection = self.cuts(category, region, systematic) & cuts
    table_selection = selection.where()
    if systematic == 'NOMINAL':
        log.info("requesting table from %s" % (self.__class__.__name__))
    else:
        log.info("requesting table from %s for systematic %s " % (
            self.__class__.__name__, systematic_name(systematic)))
    log.debug("using selection: %s" % selection)
    # TODO: handle cuts in weight expressions
    weight_branches = self.get_weight_branches(systematic, no_cuts=True)
    if systematic in SYSTEMATICS_BY_WEIGHT:
        systematic = 'NOMINAL'
    recs = []
    if return_idx:
        idxs = []
    for ds, _, sys_tables, sys_events, xs, kfact, effic in self.datasets:
        try:
            table = sys_tables[systematic]
            events = sys_events[systematic]
        except KeyError:
            log.debug(
                "table for %s not present for %s "
                "using NOMINAL" % (systematic, ds.name))
            table = sys_tables['NOMINAL']
            events = sys_events['NOMINAL']
        actual_scale = self.scale
        if isinstance(self, Ztautau):
            if systematic == ('ZFIT_UP',):
                log.debug("scaling up for ZFIT_UP")
                actual_scale += self.scale_error
            elif systematic == ('ZFIT_DOWN',):
                log.debug("scaling down for ZFIT_DOWN")
                actual_scale -= self.scale_error
        weight = (
            scale * actual_scale * LUMI[self.year] *
            xs * kfact * effic / events)
        # read the table with a selection
        try:
            if table_selection:
                rec = table.read_where(table_selection, **kwargs)
            else:
                rec = table.read(**kwargs)
        except Exception as e:
            print(table)
            print(e)
            continue
            #raise
        if return_idx:
            # only valid if table_selection is non-empty
            idx = table.get_where_list(table_selection, **kwargs)
            idxs.append(idx)
        # add weight field
        if include_weight:
            weights = np.empty(rec.shape[0], dtype='f8')
            weights.fill(weight)
            # merge the weight fields
            weights *= reduce(np.multiply,
                              [rec[br] for br in weight_branches])
            # drop other weight fields
            rec = recfunctions.rec_drop_fields(rec, weight_branches)
            # add the combined weight
            rec = recfunctions.rec_append_fields(rec,
                                                 names='weight',
                                                 data=weights,
                                                 dtypes='f8')
            if rec['weight'].shape[0] > 1 and rec['weight'].sum() == 0:
                log.warning("{0}: weights sum to zero!".format(table.name))
        if fields is not None:
            try:
                rec = rec[fields]
            except Exception as e:
                print(table)
                print(rec.shape)
                print(rec.dtype)
                print(e)
                raise
        recs.append(rec)
    if return_idx:
        return zip(recs, idxs)
    return recs

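# Generic sketch of the weight-merging step in records() above: multiply several
# per-event weight columns into a single 'weight' column, drop the originals, and
# append the combined column. The branch names and normalisation are made up.
import numpy as np
from numpy.lib import recfunctions
from functools import reduce

toy_rec = np.array([(1.0, 2.0, 0.5), (1.5, 1.0, 2.0)],
                   dtype=[('w_trig', 'f8'), ('w_id', 'f8'), ('pt', 'f8')])
weight_branches = ['w_trig', 'w_id']
weights = np.full(toy_rec.shape[0], 0.1)  # overall normalisation (e.g. lumi * xs / events)
weights *= reduce(np.multiply, [toy_rec[br] for br in weight_branches])
toy_rec = recfunctions.rec_drop_fields(toy_rec, weight_branches)
toy_rec = recfunctions.rec_append_fields(toy_rec, names='weight',
                                         data=weights, dtypes='f8')
# toy_rec.dtype.names == ('pt', 'weight')
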
def read_file_prune_fields_clean_values(infile_name, x_name, y_name):
    data = np.recfromcsv(infile_name, delimiter=',')
    data = retain_relevant_fields(data)
    data = data[data[y_name] != -1]  # this takes care of the garbage rows
    return data[y_name], data[x_name], rf.rec_drop_fields(data, [y_name, x_name])

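# Sketch of the y/x/rest split that read_file_prune_fields_clean_values returns,
# shown on an in-memory structured array instead of a CSV file; the column names
# are made up for illustration.
import numpy as np
from numpy.lib import recfunctions as rf

toy = np.array([(1, 10.0, 5.0), (-1, 20.0, 6.0), (2, 30.0, 7.0)],
               dtype=[('label', 'i8'), ('feat_a', 'f8'), ('feat_b', 'f8')])
toy = toy[toy['label'] != -1]  # drop garbage rows, as in the function above
y, x, rest = toy['label'], toy['feat_a'], rf.rec_drop_fields(toy, ['label', 'feat_a'])
# rest keeps only the remaining columns ('feat_b',)
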
def get_triggers(channel, etg, segments, cache=None, snr=None, frange=None,
                 columns=None, raw=False, **kwargs):
    """Get triggers for the given channel
    """
    # get table from etg
    try:
        Table = TABLE[etg.lower()]
    except KeyError as e:
        e.args = ('Unknown ETG %r, cannot map to LIGO_LW Table class' % etg,)
        raise
    tablename = strip_table_name(Table.tableName)
    # get default columns for this table
    if columns is None:
        for key in COLUMNS:
            if issubclass(Table, key):
                columns = COLUMNS[key][:]
                break
    if 'channel' in columns:
        columns.remove('channel')
    # find triggers
    if cache is None:
        cache = find_trigger_files(channel, etg, segments, **kwargs)
    # read cache
    trigs = lsctables.New(Table, columns=columns)
    cache = cache.unique()
    cache.sort(key=lambda x: x.segment[0])
    for segment in segments:
        if len(cache.sieve(segment=segment)):
            if tablename.endswith('_inspiral'):
                filt = lambda t: float(t.get_end()) in segment
            else:
                filt = lambda t: float(t.get_peak()) in segment
            trigs.extend(Table.read(cache.sieve(segment=segment), filt=filt))
    # format table as numpy.recarray
    recarray = trigs.to_recarray(columns=columns)
    # filter
    if snr is not None:
        recarray = recarray[recarray['snr'] >= snr]
    if tablename.endswith('_burst') and frange is not None:
        recarray = recarray[
            (recarray['peak_frequency'] >= frange[0]) &
            (recarray['peak_frequency'] < frange[1])]
    # return basic table if 'raw'
    if raw:
        return recarray
    # otherwise spend the rest of this function converting columns to
    # something useful for the hveto core analysis
    addfields = {}
    dropfields = []
    # append channel to all events
    columns.append('channel')
    addfields['channel'] = numpy.repeat(channel, recarray.shape[0])
    # rename frequency column
    if tablename.endswith('_burst'):
        recarray = recfunctions.rename_fields(
            recarray, {'peak_frequency': 'frequency'})
        idx = columns.index('peak_frequency')
        columns.pop(idx)
        columns.insert(idx, 'frequency')
    # map time to its own column
    if tablename.endswith('_inspiral'):
        tcols = ['end_time', 'end_time_ns']
    elif tablename.endswith('_burst'):
        tcols = ['peak_time', 'peak_time_ns']
    else:
        tcols = None
    if tcols:
        times = recarray[tcols[0]] + recarray[tcols[1]] * 1e-9
        addfields['time'] = times
        dropfields.extend(tcols)
        columns = ['time'] + columns[2:]
    # add and remove fields as required
    if addfields:
        names, data = zip(*addfields.items())
        recarray = recfunctions.rec_append_fields(recarray, names, data)
        recarray = recfunctions.rec_drop_fields(recarray, dropfields)
    return recarray[columns]

def drop_column(self, key):
    self.d = npr.rec_drop_fields(self.d, key)
    del self.scale[key]
    del self.unit[key]

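# Minimal hypothetical container illustrating how drop_column above keeps the
# structured array and the per-column metadata (scale/unit) in sync; the class
# name and fields are assumptions for this sketch, not part of the original code.
import numpy as np
import numpy.lib.recfunctions as npr

class Columns:
    def __init__(self, d, scale, unit):
        self.d, self.scale, self.unit = d, scale, unit

c = Columns(np.array([(1, 2.0)], dtype=[('a', 'i8'), ('b', 'f8')]),
            scale={'a': 1, 'b': 10}, unit={'a': 'm', 'b': 's'})
drop_column(c, 'b')  # call the function above directly with the container
# c.d.dtype.names == ('a',); 'b' is removed from c.scale and c.unit
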
def filter_events(
    self,
    img_heights_px,
    drop_events_on_top=False,
    drop_events_on_bottom=False,
    drop_longer_and_shorter=False,
    drop_positive=False,
    drop_negative=False,
    force_keep={},
    force_drop={},
    force_position={},
    obsolete_regions={},
):
    """
    Filter out wrongly detected events:

    - events shorter than 0.9 or longer than 1.1 of the median event length
    - events starting on the image top (partial events)
    - events ending on the image bottom (partial events)
    - events in force_drop

    Fix event positions according to force_position.

    :param img_heights_px: image height for all cameras, {cam: height_px, ... }
    :param drop_events_on_top: drop events starting on the top (first row or first row after obsolete region)
    :param drop_events_on_bottom: drop events ending at the bottom (last row or last row before obsolete region)
    :param drop_longer_and_shorter: drop events of nonstandard length, apply only to events that are not split
    :param force_keep: force events to NOT BE filtered, specify events by a record array with frame, position and
                       positivity combination, e.g. {cam: [(frame, positive), (frame, positive)...], cam: ... }
    :param force_drop: force events to BE filtered, {cam: [(frame, position_px), (frame, position_px), ...], cam: ... }
    :param force_position: override event position, {cam: [(frame, horizontal position in px), ...], cam: ... }
    :param obsolete_regions: ignored stripes on the top and/or image bottom, {cam: {'top': top_px, 'bottom': bot_px}, ...}
    """
    # compute median event length in px for the cameras with the same img height
    heights_px = set(img_heights_px.values())
    median_event_length_px = {}
    mask_events_not_split = {}
    events = {}
    for cam in self.events.keys():
        cam_events = self.events[cam]
        override_bad_mask = self.__queries2mask__(
            cam_events, force_drop[cam] if cam in force_drop else None)
        override_good_mask = self.__queries2mask__(
            cam_events, force_keep[cam] if cam in force_keep else None)
        # filter out events
        mask_bad = np.zeros(len(cam_events), dtype=bool)
        if drop_events_on_top:
            mask_bad |= (cam_events["position_px"] <= obsolete_regions[cam]["top"]) \
                & cam_events["positive"]
        if drop_events_on_bottom:
            mask_bad |= (cam_events["position_px"] >= img_heights_px[cam] - 1) \
                & ~cam_events["positive"]
        if drop_positive:
            mask_bad |= cam_events["positive"]
        if drop_negative:
            mask_bad |= ~cam_events["positive"]
        # if drop_longer_and_shorter:
        #     # apply filter only to the events that are not split (naturally shortened)
        #     event_length_px = median_event_length_px[img_heights_px[cam]]
        #     mask_bad |= mask_events_not_split[cam] & \
        #         (((cam_events['end'] - cam_events['position_px']) < event_length_px * 0.9) |
        #          ((cam_events['end'] - cam_events['position_px']) > event_length_px * 1.1))
        mask_bad |= override_bad_mask
        # force events to stay
        events[cam] = cam_events[~mask_bad | override_good_mask]
        # override event position
        if force_position and cam in force_position:
            for row in force_position[cam]:
                query = rec_drop_fields(force_position[cam], ["position_px"])
                idxs = np.nonzero(self.__queries2mask__(events[cam], query))[0]
                if len(idxs) == 0:
                    logging.warning("force_position can't find a matching event: %s" % str(query))
                elif len(idxs) > 1:
                    logging.warning("force_position ambiguous match for query: %s" % str(query))
                else:
                    events[cam][idxs[0]]["position_px"] = row["position_px"]
    self.events = events

def __setitem__(self, keys, values):
    """Either a single one- or multi-column item, or multiple one-column items."""
    if isinstance(values, pd.Series):
        values = values.values
        if values.dtype.char == 'O' or 'int' in values.dtype.name:
            values = values.astype(str)
    names_to_remove = []
    if isinstance(keys, str):
        # TODO: check that no-one accidentally overwrites the index?
        # quite unlikely though as self.index_key is not common
        # if keys == self.index_key:
        #     raise ValueError('The key {} is reserved for the index in BoundStructArray. '
        #                      .format(self.index_key))
        keys = [keys]
        # check if values is nested, if not, it's not multicolumn
        if (not hasattr(values[0], '__len__')
                or len(values[0]) == 1  # seems that we do not need this, as the previous line matches already
                or np.array(values[0]).dtype.char in {'S', 'U'}):  # a string is passed
            values = [values]
        else:
            # otherwise it's a multicolumn key
            key_multicol = keys[0]
            if keys[0] not in self._keys_multicol:
                self._keys_multicol += [key_multicol]
                self._keys += [key_multicol]
            # generate single-column keys
            keys = _gen_keys_from_key_multicol(key_multicol, len(values[0]))
            self._keys_multicol_lookup[key_multicol] = keys
            # remove all fields from the array that are not among keys
            keys_set = set(keys)
            for name in self.dtype.names:
                if name.startswith(key_multicol) and name not in keys_set:
                    names_to_remove.append(name)
            values = np.array(values)
            if values.shape[0] == self.shape[0]:
                values = values.T
            else:
                raise ValueError('You provided an array with {} rows but it needs '
                                 'to have {}.'.format(values.shape[0], self.shape[0]))
    else:
        values = np.array(values)  # sequence of arrays or matrix with n_keys *rows*
        if values.shape[0] == self.shape[0]:
            values = values.T
        else:
            raise ValueError('You provided an array with {} rows but it needs '
                             'to have {}.'.format(values.shape[0], self.shape[0]))
    keys = np.array(keys)
    values = np.array(values)  # sequence of arrays or matrix with n_keys *rows*
    # update keys
    for key in keys:
        if (key != self.index_key and key not in self._keys
                and _key_belongs_to_which_key_multicol(key, self._keys_multicol) < 0):
            self._keys += [key]
    if len(keys) != len(values):
        print(keys, values)
        raise ValueError('You passed {} column keys but {} arrays as columns. '
                         'If you passed a matrix instead of a sequence '
                         'of arrays, try transposing it.'.format(len(keys), len(values)))
    if values.shape[1] != self.shape[0]:
        raise ValueError('You want to add a column with {} rows '
                         'but it needs to have {} rows.'.format(values.shape[1], self.shape[0]))
    if values.dtype.char in {'U', 'S'}:
        try:
            itemsize = values.dtype.itemsize
            if values.dtype.char == 'U':
                itemsize /= 4
            if itemsize > np.dtype(STRING_TYPE).itemsize:
                logg.m('WARNING: truncating strings to length {}'.format(
                    np.dtype(STRING_TYPE).itemsize))
            values = values.astype(STRING_TYPE)
        except UnicodeEncodeError:
            raise ValueError('Currently only support ascii strings. '
                             'Don\'t use "ö" etc. for sample annotation.')
    present = np.intersect1d(keys, self.dtype.names)
    absent = np.setdiff1d(keys, self.dtype.names)
    if any(present):
        for k, v in zip(present, values[np.in1d(keys, present)]):
            if (v.dtype != self.dtype[k]
                    and v.dtype.itemsize > self.dtype[k].itemsize):
                # TODO: need to reallocate memory
                # or allow storing objects, or use pd.dataframes
                raise SetKeyError(k, v.dtype, self.dtype[k])
            super(BoundStructArray, self).__setitem__(k, v)
    if any(absent):
        if values.shape[1] > len(self):
            raise ValueError('New column has too many entries ({} > {})'.format(
                values.shape[1], len(self)))
        source = append_fields(self, absent, values[np.in1d(keys, absent)],
                               usemask=False, asrecarray=True)
        if names_to_remove:
            source = rec_drop_fields(source, names_to_remove)
        new = BoundStructArray(source, self.index_key, self._is_attr_of,
                               keys_multicol=self._keys_multicol)
        setattr(self._is_attr_of[0], self._is_attr_of[1], new)
