def create_tensor(self, json_data):
    """Flatten per-sensor noise measurements into an Orange ``Table``.

    Every measurement of every sensor becomes one row holding the decibel
    level, the sensor's position in ``json_data``, its two location values,
    the POSIX timestamp and the FFT values; the parsed measurement time is
    the class variable.

    Args:
        json_data: Iterable of sensor dicts. Each needs a ``"location"``
            pair and a ``"data"`` list whose items carry ``"decibels"``,
            ``"measured_at"``, ``"fftValues"`` and ``"frequencyRange"``.

    Returns:
        The table produced by ``Table.from_list``.

    Raises:
        ValueError: If a ``"measured_at"`` value does not match
            ``%Y-%m-%d %H:%M:%S.%f`` once its "T"/"Z" markers are removed.
    """
    # Imported locally so the block does not rely on the module's import list.
    from datetime import timezone

    print(json_data)

    decibels, sensor_ids = [], []
    longitudes, latitudes = [], []
    timestamps, time_strings = [], []
    fft_rows = []
    bin_frequencies = []

    for sensor_index, sensor in enumerate(json_data):
        for measurement in sensor["data"]:
            decibels.append(measurement["decibels"])
            sensor_ids.append(sensor_index)
            latitudes.append(sensor["location"][0])
            longitudes.append(sensor["location"][1])

            stamp = str(measurement["measured_at"])
            stamp = stamp.replace("T", " ").replace("Z", "")
            measured = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S.%f')
            # The stripped "Z" marked the value as UTC. A naive datetime's
            # timestamp() would be computed in the host's local zone, so the
            # zone is attached explicitly to keep the epoch value correct.
            timestamps.append(
                measured.replace(tzinfo=timezone.utc).timestamp())
            time_strings.append(str(measured))

            fft_values = measurement["fftValues"]
            fft_rows.append(fft_values)
            n_bins = len(fft_values)
            # Centre frequency of each FFT bin; the bins of the last
            # measurement seen name the FFT columns.
            bin_frequencies = [
                math.floor(((i + 0.5) / n_bins) * measurement["frequencyRange"])
                for i in range(n_bins)
            ]

    time_var = TimeVariable()
    parsed_times = [time_var.parse(text) for text in time_strings]

    columns = [
        ContinuousVariable("decibels"),
        ContinuousVariable("sensor_id"),
        ContinuousVariable("longitude"),
        ContinuousVariable("lattitude"),
        ContinuousVariable("timestamp"),
    ]
    columns.extend(ContinuousVariable(str(freq)) for freq in bin_frequencies)

    # Transpose so that each FFT bin becomes one column of the table.
    fft_columns = list(map(list, zip(*fft_rows)))
    return Table.from_list(
        Domain(columns, [TimeVariable('datetime')]),
        list(zip(decibels, sensor_ids, longitudes, latitudes, timestamps,
                 *fft_columns, parsed_times)))
def create_matrix(self, json_data):
    """Pivot sensor readings into a table with one row per timestamp.

    Each distinct ``"measured_at"`` value becomes a row; each sensor that
    has at least one reading becomes a ``sensor_<n>`` column holding its
    decibel level at that time (0 where it has no reading).

    Args:
        json_data: Iterable of dicts with a ``"sensor"`` key and a ``"data"``
            list whose items carry ``"measured_at"`` and ``"decibels"``.

    Returns:
        The table produced by ``Table.from_list``.
    """
    per_sensor = defaultdict(list)
    levels_by_time = defaultdict(int)
    for entry in json_data:
        print(entry["sensor"])
        for reading in entry["data"]:
            per_sensor[entry["sensor"]].append(
                [reading["measured_at"], reading["decibels"]])
            levels_by_time[reading["measured_at"]] += 1
    print(per_sensor, levels_by_time)

    # Replace the occurrence counters with one zero-filled slot per sensor.
    n_sensors = len(per_sensor)
    for stamp in levels_by_time:
        levels_by_time[stamp] = [0] * n_sensors

    for column, readings in enumerate(per_sensor.values()):
        print(readings)
        for stamp, level in readings:
            levels_by_time[stamp][column] = level

    rows = []
    for stamp, levels in levels_by_time.items():
        cleaned = stamp.replace("T", " ").replace("Z", "")
        parsed = TimeVariable().parse(
            str(datetime.strptime(cleaned, '%Y-%m-%d %H:%M:%S.%f')))
        rows.append([parsed] + levels)

    sensor_vars = [
        ContinuousVariable("sensor_" + str(i)) for i in range(n_sensors)
    ]
    return Table.from_list(
        Domain([TimeVariable('datetime')], sensor_vars), rows)
def test_have_date(self):
    """Check rendering of a date-only value, then a time-only value."""
    variable = TimeVariable("time")

    date_only = variable.parse("1937-08-02")
    self.assertEqual(variable.repr_val(date_only), "1937-08-02")

    # The same variable has already parsed a date, so the time-only value
    # is expected to be rendered as a full datetime.
    time_only = variable.parse("16:20")
    self.assertEqual(variable.repr_val(time_only), "1970-01-01 16:20:00")
def _run(self):
    """Pull pending samples from every stream and publish the merged buffer.

    Chunks pulled from ``self.streams`` are merged into ``self.df`` by
    timestamp, the frame is trimmed to its last ``self.buffer_size`` rows,
    and, if any stream delivered samples, ``self.callback`` receives a table
    whose first column is the frame index.
    """
    frame = self.df
    got_new_samples = False

    for inlet in self.streams:
        samples, timestamps = inlet.pull_chunk()
        if len(timestamps) == 0:
            continue

        # Channel labels are looked up by "<stream name> <stream type>".
        info = inlet.info()
        labels = self.ch_map[f'{info.name()} {info.type()}']
        values = np.array(samples).reshape((-1, len(labels)))
        chunk = pd.DataFrame(index=pd.Index(timestamps, name='t'),
                             data=values,
                             columns=labels)

        # Lay the chunk over the union of both indexes, then fill the
        # remaining gaps from the frame accumulated so far.
        merged_index = frame.index.union(chunk.index).sort_values()
        frame = chunk.reindex(index=merged_index,
                              columns=frame.columns).fillna(frame)
        got_new_samples = True

    # Keep only the newest rows.
    self.df = frame.iloc[-self.buffer_size:]

    if not got_new_samples:
        return

    # The index is prepended as a regular column of the emitted table.
    index_column = self.df.index.to_numpy().reshape((-1, 1))
    table_data = np.concatenate([index_column, self.df.to_numpy()], axis=-1)
    domain = Domain([TimeVariable(self.df.index.name),
                     *map(ContinuousVariable, self.df.columns)])
    self.callback(Table.from_numpy(domain, table_data))
def test_hash(self):
    """Check how a variable's hash reacts to its name, compute value and type."""
    first = ContinuousVariable("a")
    second = ContinuousVariable("a")
    self.assertEqual(hash(first), hash(second))

    # Two distinct lambdas are never considered the same compute value.
    first._compute_value = lambda x: x
    self.assertNotEqual(hash(first), hash(second))
    second._compute_value = lambda x: x
    self.assertNotEqual(hash(first), hash(second))

    source_one = ContinuousVariable("a")
    source_two = ContinuousVariable("a")
    first._compute_value = Identity(source_one)
    self.assertNotEqual(hash(first), hash(second))
    second._compute_value = Identity(source_two)
    self.assertEqual(hash(first), hash(second))

    time_named_a = TimeVariable("a")
    second = ContinuousVariable("b")
    self.assertEqual(hash(source_one), hash(source_two))
    self.assertNotEqual(hash(source_one), hash(second))
    self.assertNotEqual(hash(source_one), hash(time_named_a))
def prepare_data(self):
    """Create the fixture domain, a 4x5 data table and the variable hints."""
    continuous = [ContinuousVariable(name) for name in ("x", "y", "z")]
    temporal = [TimeVariable(name) for name in ("t", "u")]
    self.domain = Domain(continuous + temporal)

    values = np.arange(20).reshape(4, 5)
    self.data = Table.from_numpy(self.domain, values)

    hints = {DefaultKey: VarHint(Methods.Keep, ())}
    hints[("x", False)] = VarHint(Methods.EqualFreq, (3, ))
    hints[("y", False)] = VarHint(Methods.Keep, ())
    hints[("z", False)] = VarHint(Methods.Remove, ())
    hints[("t", True)] = VarHint(Methods.Binning, (2, ))
    self.var_hints = hints
def setUpClass(cls) -> None:
    """Build the two tables shared by the tests: mixed-type and string-only."""
    super().setUpClass()

    attributes = [
        DiscreteVariable("A", values=("a", "b", "c")),
        DiscreteVariable("B", values=("0", "1", "2")),
        ContinuousVariable("C"),
        TimeVariable("D", have_time=True),
    ]
    domain = Domain(attributes, metas=[StringVariable("S")])
    rows = [
        [0, 2, 0.25, 180],
        [1, 1, 1.25, 360],
        [2, 0, 0.20, 720],
        [1, 0, 0.00, 0],
    ]
    cls.data = Table.from_list(domain, rows)

    string_domain = Domain(
        [], [], metas=[StringVariable("S"), StringVariable("T")])
    cls.data_str = Table.from_list(
        string_domain, [["0.1", "2010"], ["1.0", "2020"]])
def sanitize_variable(valuemap, values, orig_values, coltype,
                      coltype_kwargs, name=None):
    """Create the variable for a column and convert the column's values.

    Args:
        valuemap: Values for a discrete variable; when given and ``coltype``
            is discrete, stored into ``coltype_kwargs`` as ``values``.
        values: Already converted values, returned unchanged unless one of
            the branches below replaces them.
        orig_values: The raw column values the conversions start from.
        coltype: A ``Variable`` subclass (asserted).
        coltype_kwargs: Keyword arguments for ``coltype.make``; may be
            updated in place.
        name: Name passed to ``coltype.make``.

    Returns:
        A ``(values, var)`` tuple with the converted values and the variable.
    """
    assert issubclass(coltype, Variable)

    def get_number_of_decimals(values):
        """Return the largest count of characters after "." (0 if none)."""
        len_ = len
        ndecimals = max((len_(value) - value.find(".")
                         for value in values if "." in value),
                        default=1)
        return ndecimals - 1

    if issubclass(coltype, DiscreteVariable) and valuemap is not None:
        coltype_kwargs.update(values=valuemap)

    var = coltype.make(name, **coltype_kwargs)

    if isinstance(var, DiscreteVariable):
        # Map discrete data to 'ints' (or at least what passes as int around
        # here)
        # Values missing from var.values fall back to NaN via the default
        # factory; the empty string is mapped to NaN explicitly.
        mapping = defaultdict(
            lambda: np.nan,
            {val: i for i, val in enumerate(var.values)},
        )
        mapping[""] = np.nan
        mapvalues_ = np.frompyfunc(mapping.__getitem__, 1, 1)

        def mapvalues(arr):
            """Look every element of ``arr`` up in ``mapping`` as floats."""
            arr = np.asarray(arr, dtype=object)
            return mapvalues_(arr, out=np.empty_like(arr, dtype=float),
                              casting="unsafe")
        values = mapvalues(orig_values)

    if coltype is StringVariable:
        values = orig_values

    # ContinuousVariable.number_of_decimals is supposed to be handled by
    # ContinuousVariable.to_val. In the interest of speed, the reader bypasses
    # it, so we set the number of decimals here.
    # The number of decimals is increased if not set manually (in which case
    # var.adjust_decimals would be 0).
    if isinstance(var, ContinuousVariable) and var.adjust_decimals:
        ndecimals = get_number_of_decimals(orig_values)
        if var.adjust_decimals == 2 or ndecimals > var.number_of_decimals:
            var.number_of_decimals = ndecimals
            var.adjust_decimals = 1

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var
def setUpClass(cls):
    """Create one variable of each kind and a list model that mixes them."""
    cls.disc = DiscreteVariable("gender", values=("M", "F"))
    cls.cont = ContinuousVariable("age")
    cls.string = StringVariable("name")
    cls.time = TimeVariable("birth")

    # The model also holds non-variable entries: None and a plain string.
    entries = [cls.cont, None, "Foo", cls.disc, cls.string, cls.time]
    cls.model = VariableListModel(entries)
def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    The candidates are tried in order: discrete, continuous (every
    non-missing value converts to float), time (every non-missing value is
    parsed by ``TimeVariable``), and finally string.

    Returns a ``(valuemap, values, coltype)`` tuple; ``valuemap`` is the
    sorted list of discrete values or ``None``.
    """
    discrete_values = is_discrete_values(orig_values)
    if discrete_values:
        return sorted(discrete_values), orig_values, DiscreteVariable

    raw = np.asarray(orig_values)
    if namask is None:
        namask = isnastr(raw)

    converted = np.empty_like(raw, dtype=float)
    converted[namask] = np.nan

    # First attempt: plain floats.
    try:
        np.copyto(converted, raw, where=~namask, casting="unsafe")
    except ValueError:
        pass
    else:
        return None, converted, ContinuousVariable

    # Second attempt: timestamps; if that fails too, keep the raw strings.
    parser = TimeVariable('_')
    try:
        converted[~namask] = [parser.parse(item) for item in raw[~namask]]
    except ValueError:
        return None, raw, StringVariable
    return None, converted, TimeVariable
def test_parse_repr(self):
    """Round-trip every ``self.TESTS`` entry through ``to_val``/``repr_val``."""
    for text, expected_ts, expected_repr in self.TESTS:
        variable = TimeVariable('time')
        # to_val delegates to parse for string input.
        parsed = variable.to_val(text)
        if not np.isnan(parsed):
            self.assertEqual(parsed, expected_ts, msg=text)
        self.assertEqual(variable.repr_val(parsed), expected_repr, msg=text)
def test_have_date(self):
    """Check rendering of a date-only value, then a time-only value."""
    variable = TimeVariable('time')

    date_only = variable.parse('1937-08-02')
    self.assertEqual(variable.repr_val(date_only), '1937-08-02')

    # The same variable has already parsed a date, so the time-only value
    # is expected to be rendered as a full datetime.
    time_only = variable.parse('16:20')
    self.assertEqual(variable.repr_val(time_only), '1970-01-01 16:20:00')
def get_variable(self, field_name, type_code, inspect_values=False):
    """Choose the variable for a database column from its type code.

    Args:
        field_name: Column name; becomes the variable name.
        type_code: Numeric type code of the column.
        inspect_values: When true, integer and character columns are turned
            into discrete variables if ``self.get_distinct_values`` returns
            a non-empty result.

    Returns:
        A continuous, time, discrete or string variable; any type code not
        listed below yields a string variable.
    """
    FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
    INT_TYPES = (20, 21, 23)  # bigint, smallint, int
    CHAR_TYPES = (25, 1042, 1043)  # text, char, varchar
    BOOLEAN_TYPES = (16,)  # bool
    DATE_TYPES = (1082, 1114, 1184)  # date, timestamp, timestamptz
    TIME_TYPES = (1083, 1114, 1184, 1266)  # time, timestamp, timestamptz, timetz

    def discrete_from_values():
        """Return a discrete variable built from distinct values, or None."""
        if not inspect_values:
            return None
        distinct = self.get_distinct_values(field_name)
        if distinct:
            return DiscreteVariable(field_name, distinct)
        return None

    if type_code in FLOATISH_TYPES:
        return ContinuousVariable(field_name)

    carries_date = type_code in DATE_TYPES
    carries_time = type_code in TIME_TYPES
    if carries_time or carries_date:
        time_var = TimeVariable(field_name)
        time_var.have_date |= carries_date
        time_var.have_time |= carries_time
        return time_var

    if type_code in INT_TYPES:
        discrete = discrete_from_values()
        if discrete is not None:
            return discrete
        return ContinuousVariable(field_name)

    if type_code in BOOLEAN_TYPES:
        return DiscreteVariable(field_name, ['false', 'true'])

    if type_code in CHAR_TYPES:
        discrete = discrete_from_values()
        if discrete is not None:
            return discrete

    return StringVariable(field_name)
def _date_to_iso(date):
    """Parse a loosely formatted date string with ``TimeVariable``.

    Season names are replaced by a month, anything from the first "-" on is
    dropped, and the rest is tried as "year month day", "year month" and
    "year". If nothing matches, a ``RuntimeWarning`` is issued and the
    result of parsing NaN is returned.
    """
    season_months = (
        ('fall', 'Sep'),
        ('autumn', 'Sep'),
        ('winter', 'Dec'),
        ('spring', 'Mar'),
        ('summer', 'Jun'),
    )

    text = date.lower()
    for season, month in season_months:
        text = text.replace(season, month)
    # Keep only the start of a range: 2015 Sep-Dec --> 2015 Sep
    text = text.split('-')[0]

    parser = TimeVariable()
    for pattern in ('%Y %b %d', '%Y %b', '%Y'):
        try:
            iso_text = datetime.strptime(text, pattern).date().isoformat()
            parsed = parser.parse(iso_text)
        except ValueError:
            continue  # Fall through to the next, less specific pattern.
        return parsed

    warnings.warn('Could not parse "{}" into a date.'.format(text),
                  RuntimeWarning)
    return parser.parse(np.nan)
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): Pairs whose first item is the name of a
            field to include; the second item is not used here.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
        records, includes_metadata=includes_metadata)

    # The publication date is the only field stored as a time variable.
    meta_vars = [
        TimeVariable(field_name) if field_name == 'pub_date'
        else StringVariable.make(field_name)
        for field_name, _ in includes_metadata
    ]

    section_var = DiscreteVariable('section_name',
                                   values=list(set(class_values)))
    domain = Domain([], class_vars=[section_var], metas=meta_vars)

    # Column vector with one class index per record.
    encoded = [section_var.to_val(section) for section in class_values]
    Y = np.array(encoded)[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
def setUp(self):
    """Create a two-row table with discrete, continuous, time and string data."""
    attributes = [
        DiscreteVariable("d", values=tuple("abc")),
        ContinuousVariable("c"),
        TimeVariable("t"),
    ]
    domain = Domain(attributes, [], [StringVariable("s")])

    nan = np.nan
    X = np.array([[1, nan, 15],
                  [nan, 42, nan]])
    Y = np.empty((2, 0))  # the domain has no class variables
    metas = np.array([["foo"], [""]])
    self.data = Table(domain, X, Y, metas)
def test_parse_utc(self):
    """Check that the UTC offset is shown only while every value has one."""
    variable = TimeVariable('time')
    naive_text, tz_suffix = '2015-10-18 22:48:20', '+0200'

    with_offset = variable.parse(naive_text + tz_suffix)
    self.assertEqual(variable.repr_val(with_offset), naive_text + tz_suffix)

    # Once a value is without a TZ, all the values lose it
    without_offset = variable.parse(naive_text)
    self.assertEqual(variable.repr_val(without_offset), naive_text)
    self.assertEqual(variable.repr_val(with_offset), '2015-10-18 20:48:20')
def run(self) -> None:
    """Evaluate ``self.next`` over the queued job and emit the result table.

    Does nothing when ``self.job`` is ``None``. Otherwise the job is unpacked
    into times and two position sequences, ``self.next`` is applied to each
    position pair, and a two-column table (``t``, ``fxn``) is printed and
    sent through ``self.output``.
    """
    if self.job is None:
        return

    t, x_norm_pos, y_norm_pos = self.job
    fxn = np.array([
        self.next(x_norm_pos[i], y_norm_pos[i])
        for i in range(len(x_norm_pos))
    ])

    domain = Domain([TimeVariable('t'), ContinuousVariable('fxn')])
    table = Table.from_numpy(domain, np.stack([t, fxn], axis=-1))
    print(table)
    self.output.emit(table)
def time_vals(self, column):
    """Parse one column of ``self._table`` as exact ISO times.

    Returns:
        A list with one parsed value per row (``None`` cells stay ``None``),
        or ``None`` if parsing raises ``ValueError``.
    """
    cells = [row[column] for row in self._table]
    try:
        parser = TimeVariable("_")
        parsed = []
        for cell in cells:
            parsed.append(
                None if cell is None else parser.parse_exact_iso(cell))
    except ValueError:
        return None
    return parsed
def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """Convert a column's values and create or reuse its variable.

    Args:
        valuemap: Sequence of discrete values; when non-empty, values are
            replaced by their index in it (NaN if absent).
        values: Already converted values, returned unless replaced below.
        orig_values: The raw column values the conversions start from.
        coltype: Variable class for the column.
        coltype_kwargs: Keyword arguments for the variable; updated in place
            with ``values=valuemap`` for discrete data.
        domain_vars: When ``None``, no variable is created.
        existing_var: Name of an existing variable to reuse (stripped), or a
            falsy value to create a new one.
        new_var_name: Name used when a new variable is created.
        data: Array consulted when discrete values need reordering.

    Returns:
        A ``(values, var)`` tuple; ``var`` is ``None`` if ``domain_vars`` is
        ``None``.
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        # NOTE(review): `is np.nan` is an identity test, so only that exact
        # object is replaced by '' -- confirm other NaN floats cannot occur.
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order, old_order = \
                var.values, coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                # Shift every code past the valid range first so that codes
                # already rewritten cannot collide with ones still pending.
                offset = len(new_order)
                # NOTE(review): `data` defaults to None but `.ndim` is read
                # here -- callers reaching this branch must pass an array.
                column = values if data.ndim > 1 else data
                column += offset
                for i, val in enumerate(var.values):
                    try:
                        oldval = old_order.index(val)
                    except ValueError:
                        continue
                    # `bn` is presumably bottleneck; the replacement is done
                    # in place on `column` -- TODO confirm the import.
                    bn.replace(column, offset + oldval, new_order.index(val))

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var
def create_matrix(self, json_data):
    """Pivot sensor readings into a table with one row per timestamp.

    Each distinct measurement time becomes a row and each entry of
    ``json_data`` a ``sensor_<index>`` column holding that sensor's decibel
    level at that time (0 where it has no reading). Rows keep the order in
    which their timestamps were first seen; when a sensor reports the same
    time twice, the later reading wins.

    Args:
        json_data: Sized sequence of sensor dicts, each with a ``"data"``
            list whose items carry ``"measured_at"`` and ``"decibels"``.
            An empty sequence gives an empty row list instead of raising
            ``IndexError``.

    Returns:
        The table produced by ``Table.from_list``.

    Raises:
        ValueError: If a ``"measured_at"`` value does not match
            ``%Y-%m-%d %H:%M:%S.%f`` once its "T"/"Z" markers are removed.
    """
    print(json_data)
    n_sensors = len(json_data)

    # Used for parsing only, so one instance serves every reading.
    time_parser = TimeVariable()

    # Parsed time -> one decibel slot per sensor. A single mapping replaces
    # per-sensor dicts that all had to be touched for every reading.
    levels_by_time = {}
    for sensor_index, sensor in enumerate(json_data):
        for reading in sensor["data"]:
            stamp = str(reading["measured_at"])
            stamp = stamp.replace("T", " ").replace("Z", "")
            parsed = time_parser.parse(
                str(datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S.%f')))
            slots = levels_by_time.setdefault(parsed, [0] * n_sensors)
            slots[sensor_index] = reading["decibels"]

    data_matrix = [[parsed, *slots]
                   for parsed, slots in levels_by_time.items()]
    sensor_vars = [
        ContinuousVariable("sensor_" + str(i)) for i in range(n_sensors)
    ]
    return Table.from_list(
        Domain([TimeVariable('datetime')], sensor_vars), data_matrix)
def test_hash(self):
    """Check that a ``Value`` equals and hashes like the value it wraps."""
    hashable_cases = (
        (ContinuousVariable, 1234.5),
        (StringVariable, "test"),
        (TimeVariable, 1234.5),
    )
    for variable_type, raw in hashable_cases:
        wrapped = Value(variable_type("var"), raw)
        self.assertTrue(wrapped == raw and hash(wrapped) == hash(raw))

    # Values of discrete variables are expected to be unhashable.
    discrete = Value(DiscreteVariable("var", ["red", "green", "blue"]), 1)
    self.assertRaises(TypeError, hash, discrete)
def test_no_date_no_time(self):
    """Exercise an editor for a time variable without date/time flags."""
    on_change = Mock()
    time_editor = TimeVariableEditor(
        self.parent, TimeVariable("var"), on_change)

    self.assertEqual(time_editor.value, 0)
    # NOTE(review): the next two checks look at self.editor / self.callback,
    # not at the editor and mock created above -- confirm this is intended.
    self.assertEqual(self.editor._edit.dateTime(), _datetime(1970, 1, 1))
    self.callback.assert_not_called()

    picked = QDateTime(QDate(2001, 9, 9), QTime(1, 2, 3))
    time_editor._edit.setDateTime(picked)
    self.assertEqual(time_editor._edit.dateTime(), picked)
    self.assertEqual(time_editor.value, 999993600 + 3723)
    on_change.assert_called_once()
def test_column_str_repr(self):
    """Check ``column_str_repr`` output, including "?" for missing values."""
    cases = (
        (StringVariable("S"),
         np.array(["A", "", "B"]),
         ["A", "?", "B"]),
        (ContinuousVariable("C"),
         np.array([0.1, np.nan, 1.0]),
         ["0.1", "?", "1"]),
        (DiscreteVariable("D", ("a", "b")),
         np.array([0., np.nan, 1.0]),
         ["a", "?", "b"]),
        (TimeVariable("T", have_date=False, have_time=True),
         np.array([0., np.nan, 1.0]),
         ["00:00:00", "?", "00:00:01"]),
    )
    for variable, column, expected in cases:
        assert_array_equal(column_str_repr(variable, column), expected)
def table_from_frame(df, *, force_nominal=False):
    """Convert a pandas DataFrame into a ``Table``.

    Columns are classified in this order: discrete, time, continuous; every
    remaining column becomes a string meta attribute.

    Args:
        df: The data frame to convert.
        force_nominal: When true, every object-dtype column is treated as
            discrete regardless of how many distinct values it has.

    Returns:
        The table produced by ``Table.from_numpy``.
    """
    def _is_discrete(s):
        # Categorical columns, or object columns that are forced nominal or
        # have fewer distinct values than size ** 0.666.
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                # Any failure to convert just means "not a datetime column".
                # NOTE(review): `infer_datetime_format` may be deprecated in
                # the pandas version in use -- confirm.
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:
            pass
        return False

    attrs, metas = [], []
    X, M = [], []

    # If df index is not a simple RangeIndex (or similar), put it into data
    # NOTE(review): `Index.is_integer()` may be unavailable in recent pandas
    # releases -- confirm the supported version.
    if not (df.index.is_integer() and
            (df.index.is_monotonic_increasing or
             df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(
                DiscreteVariable(name,
                                 discrete.categories.astype(str).tolist()))
            # Category code -1 is turned into NaN.
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            # Round-trip through strings so that tvar.parse does the
            # conversion; 'NaT' is turned into NaN first.
            X.append(
                s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    # Row count, needed to shape an empty X when there are no attributes.
    MAX_LENGTH = max(len(X[0]) if X else 0,
                     len(M[0]) if M else 0)
    return Table.from_numpy(
        Domain(attrs, None, metas),
        np.column_stack(X) if X else np.empty((MAX_LENGTH, 0)),
        None,
        np.column_stack(M) if M else None)
def generateFeatures(in_data):
    """Return a copy of ``in_data`` in a new domain with derived columns.

    For every row the URL feature vector, a formatted datetime, the URL's
    third split component and pseudo-hashes of user, user agent and
    categories are written into the new table.
    """
    url_features = [ContinuousVariable(f'FEAT_URL_{i}') for i in range(7)]
    attributes = [
        'TIMESTAMP',
        'BYTES',
        TimeVariable('DATETIME'),
        *url_features,
        ContinuousVariable('FEAT_USER_1'),
        ContinuousVariable('FEAT_USER_AGENT_1'),
        ContinuousVariable('FEAT_CATEGORIES_1'),
    ]
    new_domain = Domain(attributes,
                        class_vars=in_data.domain.class_var,
                        metas=in_data.domain.metas,
                        source=in_data.domain)
    new_table = Table(new_domain, in_data)

    for row in new_table:
        url = str(row['URL'])
        for position, value in enumerate(url_vector(url)):
            row[f'FEAT_URL_{position}'] = value
        moment = datetime.datetime.fromtimestamp(row['TIMESTAMP'])
        row['DATETIME'] = moment.strftime('%Y-%m-%d %H:%M:%S')
        row['DOMAIN'] = url_split(str(row['URL']))[2]
        row['FEAT_USER_1'] = pseudo_hash(row['USER'])
        row['FEAT_USER_AGENT_1'] = pseudo_hash(row['USER_AGENT'])
        row['FEAT_CATEGORIES_1'] = pseudo_hash(row['CATEGORIES'])
    return new_table
def __getitem__(self, key):
    """Return the pixmap for ``key``, filling the mapping on first use.

    A key that is not stored is translated to ``vartype(key)`` when it is a
    ``Variable`` and to ``-1`` otherwise.
    """
    if not self:
        # One entry per variable type, plus -1 as the "unknown" entry.
        icon_specs = [
            (vartype(ContinuousVariable("c")), "N", (202, 0, 32)),
            (vartype(DiscreteVariable("d")), "C", (26, 150, 65)),
            (vartype(StringVariable("s")), "S", (0, 0, 0)),
            (vartype(TimeVariable("t")), "T", (68, 170, 255)),
            (-1, "?", (128, 128, 128)),
        ]
        for type_id, letter, rgb in icon_specs:
            self[type_id] = createAttributePixmap(letter, QtGui.QColor(*rgb))

    if key in self:
        lookup = key
    elif isinstance(key, Variable):
        lookup = vartype(key)
    else:
        lookup = -1
    return super().__getitem__(lookup)
def test_have_time(self):
    """Exercise an editor for a time variable created with ``have_time=1``."""
    on_change = Mock()
    time_editor = TimeVariableEditor(
        self.parent, TimeVariable("var", have_time=1), on_change)

    self.assertEqual(time_editor.value, 0)
    # NOTE(review): the next two checks look at self.editor / self.callback,
    # not at the editor and mock created above -- confirm this is intended.
    self.assertEqual(self.editor._edit.dateTime(),
                     QDateTime(QDate(1970, 1, 1), QTime(0, 0, 0)))
    self.callback.assert_not_called()

    picked = QDateTime(QDate(1900, 1, 1), QTime(1, 2, 3))
    time_editor._edit.setDateTime(picked)
    self.assertEqual(time_editor._edit.dateTime(), picked)
    self.assertEqual(time_editor.value, 3723)
    on_change.assert_called_once()
def create_domain(*ss):
    """Build a ``Domain`` from groups of short variable keys.

    Each positional argument is an iterable of keys (e.g. ``"age"``); the
    groups are passed to ``Domain`` in order as lists of variables.
    """
    catalog = {
        "age": ContinuousVariable(name="AGE"),
        "gender": DiscreteVariable(name="Gender", values=("M", "F")),
        "incomeA": ContinuousVariable(name="incomeA"),
        "income": ContinuousVariable(name="income"),
        "education": DiscreteVariable(name="education",
                                      values=("GS", "HS", "C")),
        "ssn": StringVariable(name="SSN"),
        "race": DiscreteVariable(
            name="race",
            values=("White", "Hypsanic", "African", "Other")),
        "arrival": TimeVariable("arrival"),
    }
    groups = [[catalog[key] for key in group] for group in ss]
    return Domain(*groups)
def get_domain(self):
    """Return ``self._domain`` if set, else a domain built from the columns.

    Columns are named by their 1-based position. A column is discrete when
    ``self.is_discrete`` says so (its values are the distinct non-``None``
    cells), a time variable when ``self.is_time_variable`` says so, and
    continuous otherwise.
    """
    cached = self._domain
    if cached is not None:
        return cached

    columns = []
    for ci in range(self.columnCount()):
        label = str(ci + 1)
        if self.is_discrete(ci):
            distinct = {row[ci] for row in self._table
                        if row[ci] is not None}
            columns.append(DiscreteVariable(name=label, values=distinct))
        elif self.is_time_variable(ci):
            columns.append(TimeVariable(name=label))
        else:
            columns.append(ContinuousVariable(name=label))
    return Domain(columns)