def guess_data_type(orig_values, namask=None):
    """
    Use heuristics to guess data type.

    Tries, in order: discrete (via `is_discrete_values`), float, time
    (via `TimeVariable.parse`), and finally falls back to string.

    Parameters
    ----------
    orig_values : array-like
        Raw column values (typically strings).
    namask : np.ndarray of bool, optional
        Mask of missing entries; computed with `isnastr` when not given.

    Returns
    -------
    tuple
        (valuemap, values, coltype) where `valuemap` is the sorted list of
        discrete values (or None), `values` the converted data, and
        `coltype` the guessed Variable subclass.
    """
    valuemap, values = None, orig_values
    is_discrete = is_discrete_values(orig_values)
    if is_discrete:
        valuemap = sorted(is_discrete)
        coltype = DiscreteVariable
    else:
        # try to parse as float
        orig_values = np.asarray(orig_values)
        if namask is None:
            namask = isnastr(orig_values)
        values = np.empty_like(orig_values, dtype=float)
        values[namask] = np.nan
        try:
            # casting="unsafe" lets string data be coerced to float;
            # non-numeric strings raise ValueError and fall through below
            np.copyto(values, orig_values, where=~namask, casting="unsafe")
        except ValueError:
            tvar = TimeVariable('_')
            try:
                # only parse the non-missing entries as timestamps
                values[~namask] = [tvar.parse(i) for i in orig_values[~namask]]
            except ValueError:
                coltype = StringVariable
                # return original_values
                values = orig_values
            else:
                coltype = TimeVariable
        else:
            coltype = ContinuousVariable
    return valuemap, values, coltype
def test_parse_repr(self):
    """Every TESTS case must round-trip through to_val and repr_val."""
    for date_string, expected_ts, expected_repr in self.TESTS:
        variable = TimeVariable('time')
        value = variable.to_val(date_string)  # to_val parses string input
        if not np.isnan(value):
            self.assertEqual(value, expected_ts, msg=date_string)
        self.assertEqual(variable.repr_val(value), expected_repr,
                         msg=date_string)
def test_have_date(self):
    """Parsing a time after a date makes repr include both parts."""
    variable = TimeVariable('time')
    date_value = variable.parse('1937-08-02')  # date only
    self.assertEqual(variable.repr_val(date_value), '1937-08-02')
    time_value = variable.parse('16:20')  # time only
    # variable has now seen both a date and a time, so repr is a datetime
    self.assertEqual(variable.repr_val(time_value), '1970-01-01 16:20:00')
def test_parse_utc(self):
    """Timezone info is dropped once any value without a TZ is parsed."""
    variable = TimeVariable('time')
    datestr, offset = '2015-10-18 22:48:20', '+0200'
    with_tz = variable.parse(datestr + offset)
    self.assertEqual(variable.repr_val(with_tz), datestr + offset)
    # Once a value is without a TZ, all the values lose it
    without_tz = variable.parse(datestr)
    self.assertEqual(variable.repr_val(without_tz), datestr)
    self.assertEqual(variable.repr_val(with_tz), '2015-10-18 20:48:20')
def _guess_variable(self, field_name, field_metadata, inspect_table):
    """Pick the Orange variable type best matching an MSSQL column."""
    from pymssql import STRING, NUMBER, DATETIME, DECIMAL

    type_code, *_ = field_metadata

    if type_code in (NUMBER, DECIMAL):
        return ContinuousVariable(field_name)

    if type_code == DATETIME:
        time_var = TimeVariable(field_name)
        time_var.have_date = True
        time_var.have_time = True
        return time_var

    if type_code == STRING and inspect_table:
        # Short list of distinct values suggests a categorical column.
        distinct = self.get_distinct_values(field_name, inspect_table)
        if distinct:
            return DiscreteVariable(field_name, distinct)

    return StringVariable(field_name)
def guess_data_type(orig_values):
    """
    Use heuristics to guess data type.

    Order of attempts: discrete, float, time, string (fallback).
    Returns (valuemap, values, coltype).
    """
    is_discrete = is_discrete_values(orig_values)
    if is_discrete:
        return sorted(is_discrete), orig_values, DiscreteVariable

    try:
        return [], [float(v) for v in orig_values], ContinuousVariable
    except ValueError:
        pass

    tvar = TimeVariable('_')
    try:
        return [], [tvar.parse(v) for v in orig_values], TimeVariable
    except ValueError:
        # not parseable at all -- keep the raw strings
        return [], orig_values, StringVariable
def _date_to_iso(date):
    """Normalize a loosely formatted date string and parse it.

    Season names are mapped to representative months, ranges like
    "2015 Sep-Dec" are truncated to their start, and a few known
    formats are attempted in order. Emits a RuntimeWarning and parses
    a missing value when nothing matches.
    """
    season_mapping = {
        'fall': 'Sep',
        'autumn': 'Sep',
        'winter': 'Dec',
        'spring': 'Mar',
        'summer': 'Jun',
    }

    date = date.lower()
    # Seasons to their respective months.
    for season, month in season_mapping.items():
        date = date.replace(season, month)
    date = date.split('-')[0]  # 2015 Sep-Dec --> 2015 Sep

    time_var = TimeVariable()
    for date_format in ('%Y %b %d', '%Y %b', '%Y'):
        try:
            parsed = datetime.strptime(date, date_format)
        except ValueError:
            continue  # Try the next format.
        return time_var.parse(parsed.date().isoformat())

    warnings.warn(
        'Could not parse "{}" into a date.'.format(date),
        RuntimeWarning
    )
    return time_var.parse(np.nan)
def transpose_table(table):
    """
    Transpose the rows and columns of the table.

    Args:
        table: Data in :obj:`Orange.data.Table`

    Returns:
         Transposed :obj:`Orange.data.Table`. (Genes as columns)
    """
    attrs = table.domain.attributes
    attr = [ContinuousVariable.make(ex['Gene'].value) for ex in table]
    # Set metas.
    # FIX: the original used `name is not 'Time'` -- an identity comparison
    # with a string literal that only worked through CPython interning (and
    # is a SyntaxWarning on modern Pythons). Value equality is the correct
    # test for selecting the TimeVariable column.
    new_metas = [StringVariable.make(name) if name != 'Time'
                 else TimeVariable.make(name)
                 for name in sorted(table.domain.variables[0].attributes.keys())]
    domain = Domain(attr, metas=new_metas)

    meta_values = [[exp.attributes[var.name] for var in domain.metas]
                   for exp in attrs]

    return Table(domain, table.X.transpose(), metas=meta_values)
def _guess_variable(self, field_name, field_metadata, inspect_table):
    """Map a PostgreSQL type OID onto the most fitting Orange variable."""
    type_code = field_metadata[0]

    # PostgreSQL type OIDs, grouped by the Orange type they map to.
    floatish = (700, 701, 1700)          # real, float8, numeric
    integral = (20, 21, 23)              # bigint, int, smallint
    textual = (25, 1042, 1043,)          # text, char, varchar
    boolean = (16,)                      # bool
    datelike = (1082, 1114, 1184,)       # date, timestamp, timestamptz
    # time, timestamp, timestamptz, timetz
    timelike = (1083, 1114, 1184, 1266,)

    if type_code in floatish:
        return ContinuousVariable.make(field_name)

    if type_code in timelike + datelike:
        # timestamp OIDs appear in both groups, so both flags get set
        var = TimeVariable.make(field_name)
        var.have_date |= type_code in datelike
        var.have_time |= type_code in timelike
        return var

    if type_code in integral:  # bigint, int, smallint
        # Integers with few distinct values are treated as categorical.
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            if values:
                return DiscreteVariable.make(field_name, values)
        return ContinuousVariable.make(field_name)

    if type_code in boolean:
        return DiscreteVariable.make(field_name, ['false', 'true'])

    if type_code in textual:
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            # remove trailing spaces
            values = [v.rstrip() for v in values]
            if values:
                return DiscreteVariable.make(field_name, values)

    return StringVariable.make(field_name)
def read(self):
    """
    Load spectra from an OPUS file via the optional `opusFC` package.

    Dispatches on the concrete opusFC DataReturn subtype to assemble the
    spectra matrix (`y_data`), meta columns (`meta_data`), and the Orange
    domain, then returns an Orange Table.

    Raises
    ------
    RuntimeError
        If `opusFC` is not installed.
    IOError
        If the file cannot be read by opusFC.
    ValueError
        For unsupported DataReturn objects or parameter types.
    """
    try:
        import opusFC
    except ImportError:
        raise RuntimeError(self._OPUS_WARNING)

    # self.sheet selects which data block of the file to read
    if self.sheet:
        db = self.sheet
    else:
        db = self.sheets[0]

    db = tuple(db.split(" "))
    dim = db[1]

    try:
        data = opusFC.getOpusData(self.filename, db)
    except Exception:
        raise IOError("Couldn't load spectrum from " + self.filename)

    attrs, clses, metas = [], [], []

    # One continuous attribute per x-axis point (wavenumbers etc.)
    attrs = [ContinuousVariable.make(repr(data.x[i]))
             for i in range(data.x.shape[0])]

    y_data = None
    meta_data = None

    if type(data) == opusFC.MultiRegionDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region'),
                      TimeVariable.make('start_time')])
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            start_time = region.start_time
            meta_region = np.column_stack((mapX, mapY,
                                           map_region, start_time))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.MultiRegionTRCDataReturn:
        y_data = []
        meta_data = []
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y'),
                      StringVariable.make('map_region')])
        # TRC data: attributes come from trace labels, not x values
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            meta_region = np.column_stack((mapX, mapY, map_region))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.ImageDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        data_3D = data.spectra
        # Flatten the image: one row per (map_x, map_y) pixel
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.ImageTRCDataReturn:
        metas.extend([ContinuousVariable.make('map_x'),
                      ContinuousVariable.make('map_y')])
        attrs = [ContinuousVariable.make(repr(data.labels[i]))
                 for i in range(len(data.labels))]
        data_3D = data.traces
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.TimeResolvedTRCDataReturn:
        y_data = data.traces
    elif type(data) == opusFC.TimeResolvedDataReturn:
        metas.extend([ContinuousVariable.make('z')])
        y_data = data.spectra
        meta_data = data.z
    elif type(data) == opusFC.SingleDataReturn:
        y_data = data.y[None, :]
    else:
        raise ValueError("Empty or unsupported opusFC DataReturn object: "
                         + type(data))

    # Attach selected OPUS parameters (e.g. start time SRT, sample name
    # SNM) as extra meta columns, one constant value per row.
    import_params = ['SRT', 'SNM']

    for param_key in import_params:
        try:
            param = data.parameters[param_key]
        except KeyError:
            pass  # TODO should notify user?
        else:
            try:
                param_name = opusFC.paramDict[param_key]
            except KeyError:
                param_name = param_key
            if param_key == 'SRT':
                var = TimeVariable.make(param_name)
            elif type(param) is float:
                var = ContinuousVariable.make(param_name)
            elif type(param) is str:
                var = StringVariable.make(param_name)
            else:
                raise ValueError #Found a type to handle
            metas.extend([var])
            params = np.full((y_data.shape[0],), param, np.array(param).dtype)
            if meta_data is not None:
                # NB dtype default will be np.array(fill_value).dtype in future
                meta_data = np.column_stack((meta_data, params.astype(object)))
            else:
                meta_data = params

    domain = Orange.data.Domain(attrs, clses, metas)

    meta_data = np.atleast_2d(meta_data)

    table = Orange.data.Table.from_numpy(domain,
                                         y_data.astype(float, order='C'),
                                         metas=meta_data)

    return table
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might as
    well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if 3 == len(headers):
        names, types, flags = map(list, headers)
    else:
        if 1 == len(headers):
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[
                i.split(HEADER1_FLAG_SEP, 1)
                if HEADER1_FLAG_SEP in i else ('', i)
                for i in headers[0]
            ])
            names = list(names)
        elif 2 == len(headers):
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Uppercase letters in a flag string encode the type; lowercase
        # letters encode the role flags (class, meta, weight, ignore).
        types = [
            ''.join(filter(str.isupper, flag)).lower() for flag in _flags
        ]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad a header row with '' so all header rows align with the data.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = np.array([_equal_length(list(row)) for row in data if any(row)],
                    copy=False, dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # 'i' flag: column is ignored entirely
            continue

        type_flag = types and types[col].strip()

        try:
            orig_values = [
                np.nan if i in MISSING_VALUES else i
                for i in (i.strip() for i in data[:, col])
            ]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Find the offending row to produce a useful error message.
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            if _RE_DISCRETE_LIST.match(type_flag):
                # Explicit value list in the header fixes the value order.
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            is_discrete = is_discrete_values(orig_values)
            if is_discrete:
                valuemap = sorted(is_discrete)
            else:
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    tvar = TimeVariable('_')
                    try:
                        values = [tvar.parse(i) for i in orig_values]
                    except ValueError:
                        coltype = StringVariable
                    else:
                        coltype = TimeVariable
                else:
                    coltype = ContinuousVariable

        if valuemap:
            # Map discrete data to ints
            def valuemap_index(val):
                try:
                    return valuemap.index(val)
                except ValueError:
                    return np.nan

            values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
            coltype = DiscreteVariable
            coltype_kwargs.update(values=valuemap)

        if coltype is StringVariable:
            # Strings represent missing values as '' rather than nan.
            values = ['' if i is np.nan else i for i in orig_values]

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to
        cols.append(col)
        if domain_vars is not None:
            if names and names[col]:
                # Use existing variable if available
                var = coltype.make(names[col].strip(), **coltype_kwargs)
            else:
                # Never use existing for un-named variables
                var = coltype(next(NAMEGEN), **coltype_kwargs)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

            # Reorder discrete values to match existing variable
            if var.is_discrete and not var.ordered:
                new_order, old_order = var.values, coltype_kwargs.get(
                    'values', var.values)
                if new_order != old_order:
                    # Shift all codes out of range first so the in-place
                    # remapping below cannot collide with itself.
                    offset = len(new_order)
                    column = values if data.ndim > 1 else data
                    column += offset
                    for i, val in enumerate(var.values):
                        try:
                            oldval = old_order.index(val)
                        except ValueError:
                            continue
                        bn.replace(column, offset + oldval,
                                   new_order.index(val))

        if coltype is TimeVariable:
            # Re-parse the values because only now after coltype.make call
            # above, variable var is the correct one
            values = [var.parse(i) for i in orig_values]

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def test_parse_invalid(self):
    """Strings that are not recognizable dates must raise ValueError."""
    time_var = TimeVariable('var')
    self.assertRaises(ValueError, time_var.parse, '123')
def test_parse_timestamp(self):
    """A POSIX-timestamp string parses to the corresponding UTC datetime."""
    variable = TimeVariable("time")
    moment = datetime(2016, 6, 14, 23, 8, tzinfo=timezone.utc)
    parsed = variable.parse(str(moment.timestamp()))
    self.assertEqual(variable.repr_val(parsed), '2016-06-14 23:08:00')
] discrete = list(chain(rgb, ints)) def _to_timestamps(years): return [ datetime.datetime(year, 1, 1).timestamp() if not np.isnan(year) else np.nan for year in years ] # Time variable variations, windows timestamps need to be valid timestamps so # we'll just fill it in with arbitrary years time_full = VarDataPair( TimeVariable('time_full'), np.array(_to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float), ) time_missing = VarDataPair( TimeVariable('time_missing'), np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float), ) time_all_missing = VarDataPair( TimeVariable('time_all_missing'), np.array(_to_timestamps([np.nan] * 5), dtype=float), ) time_same = VarDataPair( TimeVariable('time_same'), np.array(_to_timestamps([2004] * 5), dtype=float), ) time = [time_full, time_missing, time_all_missing, time_same]
) ints_missing = VarDataPair( DiscreteVariable('ints_missing', values=('2', '3', '4'), ordered=True), np.array([0, 1, 1, np.nan, 2], dtype=float), ) def _to_timestamps(years): return [ datetime.datetime(year, 1, 1).timestamp() if not np.isnan(year) else np.nan for year in years ] time_full = VarDataPair( TimeVariable('time_full'), np.array(_to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float), ) time_missing = VarDataPair( TimeVariable('time_missing'), np.array(_to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float), ) # String variable variations string_full = VarDataPair( StringVariable('string_full'), np.array(['a', 'b', 'c', 'd', 'e'], dtype=object), ) string_missing = VarDataPair( StringVariable('string_missing'), np.array(['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object),
def _new_var(self):
    """Build the output variable: a TimeVariable when the operation
    preserves time semantics and every input is a TimeVariable,
    otherwise a ContinuousVariable."""
    name = self._new_var_name()
    preserves_time = (
        self.operation in self.TimePreserving
        and all(isinstance(v, TimeVariable) for v in self.variables)
    )
    return TimeVariable(name) if preserves_time else ContinuousVariable(name)
def test_repr_value(self):
    """Value wrappers are unwrapped before formatting.

    Regression test, see https://github.com/biolab/orange3/pull/1760
    """
    time_var = TimeVariable('time')
    wrapped = Value(time_var, 416.3)
    self.assertEqual(time_var.repr_val(wrapped), '416.3')
def test_no_date_no_time(self):
    """Without date/time parts observed, values format as plain numbers."""
    variable = TimeVariable('relative time')
    self.assertEqual(variable.repr_val(1.6), '1.6')
def table_from_frame(df, class_name, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Parameters
    ----------
    df : pandas.DataFrame
    class_name : str
        Name of the column to expose as the (discrete) class variable.
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        # Categorical dtype, or object dtype with "few" distinct values.
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size ** .666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    # FIX: local formerly misspelled as `calss_vars`; renamed for clarity.
    attrs, metas, class_vars = [], [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if name == class_name:
            discrete = s.astype('category').cat
            class_vars.append(
                DiscreteVariable(name,
                                 discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(
                DiscreteVariable(name,
                                 discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            # 'NaT' strings become nan; everything else is parsed by tvar
            X.append(
                s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(
        Domain(attrs, class_vars, metas),
        np.column_stack(X) if X else np.empty((df.shape[0], 0)),
        None,
        np.column_stack(M) if M else None)
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might as
    well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if 3 == len(headers):
        names, types, flags = map(list, headers)
    else:
        if 1 == len(headers):
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                  if HEADER1_FLAG_SEP in i else ('', i)
                                  for i in headers[0]])
            names = list(names)
        elif 2 == len(headers):
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Uppercase letters encode the type; lowercase letters the role.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad with '' so header rows match the data width.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = np.array([_equal_length(list(row)) for row in data if any(row)],
                    copy=False, dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # 'i' flag: the column is ignored
            continue

        type_flag = types and types[col].strip()

        try:
            orig_values = [np.nan if i in MISSING_VALUES else i
                           for i in (i.strip() for i in data[:, col])]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Locate the offending row for the error message.
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            if _RE_DISCRETE_LIST.match(type_flag):
                # Explicit value list fixes the order of discrete values.
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            is_discrete = is_discrete_values(orig_values)
            if is_discrete:
                valuemap = sorted(is_discrete)
            else:
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    tvar = TimeVariable('_')
                    try:
                        values = [tvar.parse(i) for i in orig_values]
                    except ValueError:
                        coltype = StringVariable
                    else:
                        coltype = TimeVariable
                else:
                    coltype = ContinuousVariable

        if valuemap:
            # Map discrete data to ints
            def valuemap_index(val):
                try:
                    return valuemap.index(val)
                except ValueError:
                    return np.nan

            values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
            coltype = DiscreteVariable
            coltype_kwargs.update(values=valuemap)

        if coltype is StringVariable:
            # Strings use '' for missing values instead of nan.
            values = ['' if i is np.nan else i for i in orig_values]

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to
        cols.append(col)
        if domain_vars is not None:
            if names and names[col]:
                # Use existing variable if available
                var = coltype.make(names[col].strip(), **coltype_kwargs)
            else:
                # Never use existing for un-named variables
                var = coltype(next(NAMEGEN), **coltype_kwargs)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

            # Reorder discrete values to match existing variable
            if var.is_discrete and not var.ordered:
                new_order, old_order = var.values, coltype_kwargs.get(
                    'values', var.values)
                if new_order != old_order:
                    # Shift codes out of range first so the in-place
                    # remapping below cannot collide with itself.
                    offset = len(new_order)
                    column = values if data.ndim > 1 else data
                    column += offset
                    for i, val in enumerate(var.values):
                        try:
                            oldval = old_order.index(val)
                        except ValueError:
                            continue
                        bn.replace(column, offset + oldval,
                                   new_order.index(val))

        if coltype is TimeVariable:
            # Re-parse the values because only now after coltype.make call
            # above, variable var is the correct one
            values = [var.parse(i) for i in orig_values]

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
    """
    Build the four domains of the pivot table views (main, horizontal
    totals, vertical totals, grand total) for the given value variable
    and aggregation functions.

    Column variables are TimeVariables when the value variable is a
    TimeVariable and all aggregations preserve time, ContinuousVariables
    for numeric aggregations, and DiscreteVariables otherwise (in which
    case the string codes in X/X_h/X_v/X_t are remapped to indices
    in place by `map_values`).
    """
    def map_values(index, _X):
        # Collect distinct non-"nan" values of the column and replace
        # each occurrence with its index (mutates _X in place).
        values = np.unique(_X[:, index])
        values = np.delete(values, np.where(values == "nan")[0])
        for j, value in enumerate(values):
            _X[:, index][_X[:, index] == value] = j
        return values

    create_time_var = \
        isinstance(val_var, TimeVariable) and \
        all(fun in self.TimeVarFunctions for fun in agg_funs)
    create_cont_var = \
        not val_var or val_var.is_continuous and \
        (not isinstance(val_var, TimeVariable) or
         all(fun in self.FloatFunctions for fun in agg_funs))

    vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
    if create_time_var:
        kwargs = {"have_date": val_var.have_date,
                  "have_time": val_var.have_time}
        attrs = [[TimeVariable(f"{v}", **kwargs) for v in vals]] * 2
        attrs.extend([[TimeVariable("Total", **kwargs)]] * 2)
    elif create_cont_var:
        attrs = [[ContinuousVariable(f"{v}", 1) for v in vals]] * 2
        attrs.extend([[ContinuousVariable("Total", 1)]] * 2)
    else:
        attrs = []
        for x in (X, X_h):
            attrs.append([DiscreteVariable(f"{v}", map_values(i, x))
                          for i, v in enumerate(vals, 2)])
        for x in (X_v, X_t):
            attrs.append([DiscreteVariable("Total", map_values(0, x))])
    row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
    aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

    same_row_col = self._col_var is self._row_var

    # Deduplicate names between the row/aggregate variables and the
    # generated column attributes; renamed variables are recorded in
    # self.renamed for user notification.
    extra_vars = [self._row_var, aggr_attr]
    uniq_a = get_unique_names_duplicates([v.name for v in extra_vars]
                                         + [atr.name for atr in attrs[0]])
    for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])), uniq_a):
        if var.name == u:
            continue
        if idx == 0:
            self.renamed.append(self._row_var.name)
            self._row_var = self._row_var.copy(name=u)
            if same_row_col:
                self._col_var = self._row_var
            row_var_h = row_var_h.copy(name=u)
        elif idx == 1:
            self.renamed.append(aggr_attr.name)
            aggr_attr = aggr_attr.copy(name=u)
        else:
            self.renamed.append(var.name)
            attrs[0][idx - 2] = var.copy(name=u)
            attrs[1][idx - 2] = var.copy(name=u)

    if same_row_col:
        # NOTE(review): the return values of .make() are discarded here;
        # presumably this relies on make() registering/updating the shared
        # variable as a side effect -- confirm intended behavior.
        vals = tuple(v.name for v in attrs[0])
        self._row_var.make(self._row_var.name, values=vals)
        vals = tuple(v.name for v in attrs[2])
        row_var_h.make(row_var_h.name, vals)

    return (Domain([self._row_var, aggr_attr] + attrs[0]),
            Domain([row_var_h, aggr_attr] + attrs[1]),
            Domain(attrs[2]),
            Domain(attrs[3]))
def test_have_date_have_time_in_construct(self):
    """Constructor keyword have_date sets only the date flag."""
    time_var = TimeVariable('time', have_date=1)
    self.assertTrue(time_var.have_date)
    self.assertFalse(time_var.have_time)
def setUp(self):
    """Create a fresh editor around a date-only TimeVariable per test."""
    self.callback = Mock()
    variable = TimeVariable("var", have_date=1)
    self.editor = TimeVariableEditor(self.parent, variable, self.callback)
def test_parse_repr(self):
    """Every TESTS case must round-trip through parse and repr_val."""
    for date_string, expected_ts, expected_repr in self.TESTS:
        variable = TimeVariable('time')
        value = variable.parse(date_string)
        self.assertEqual(value, expected_ts, msg=date_string)
        self.assertEqual(variable.repr_val(value), expected_repr,
                         msg=date_string)
def test_time(self):
    """TimeVariable must satisfy the shared variable test contract."""
    self._test_common(TimeVariable("X"))
def read(self):
    """
    Load spectra from an OPUS file using the `opusFC` package.

    Dispatches on the concrete opusFC DataReturn subtype to build the
    spectra matrix (`y_data`), meta columns (`meta_data`), and the
    Orange domain, and returns an Orange Table.

    Raises
    ------
    IOError
        If the file cannot be read by opusFC.
    ValueError
        For unsupported DataReturn objects or parameter types.
    """
    import opusFC

    # self.sheet selects which data block of the file to read
    if self.sheet:
        db = self.sheet
    else:
        db = self.sheets[0]

    db = tuple(db.split(" "))
    dim = db[1]

    try:
        data = opusFC.getOpusData(self.filename, db)
    except Exception:
        raise IOError("Couldn't load spectrum from " + self.filename)

    attrs, clses, metas = [], [], []

    # One continuous attribute per x-axis point (wavenumbers etc.)
    attrs = [
        ContinuousVariable.make(repr(data.x[i]))
        for i in range(data.x.shape[0])
    ]

    y_data = None
    meta_data = None

    if type(data) == opusFC.MultiRegionDataReturn:
        y_data = []
        meta_data = []
        metas.extend([
            ContinuousVariable.make('map_x'),
            ContinuousVariable.make('map_y'),
            StringVariable.make('map_region'),
            TimeVariable.make('start_time')
        ])
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            start_time = region.start_time
            meta_region = np.column_stack(
                (mapX, mapY, map_region, start_time))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.MultiRegionTRCDataReturn:
        y_data = []
        meta_data = []
        metas.extend([
            ContinuousVariable.make('map_x'),
            ContinuousVariable.make('map_y'),
            StringVariable.make('map_region')
        ])
        # TRC data: attributes come from trace labels, not x values
        attrs = [
            ContinuousVariable.make(repr(data.labels[i]))
            for i in range(len(data.labels))
        ]
        for region in data.regions:
            y_data.append(region.spectra)
            mapX = region.mapX
            mapY = region.mapY
            map_region = np.full_like(mapX, region.title, dtype=object)
            meta_region = np.column_stack((mapX, mapY, map_region))
            meta_data.append(meta_region.astype(object))
        y_data = np.vstack(y_data)
        meta_data = np.vstack(meta_data)
    elif type(data) == opusFC.ImageDataReturn:
        metas.extend([
            ContinuousVariable.make('map_x'),
            ContinuousVariable.make('map_y')
        ])
        data_3D = data.spectra
        # Flatten the image: one row per (map_x, map_y) pixel
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.ImageTRCDataReturn:
        metas.extend([
            ContinuousVariable.make('map_x'),
            ContinuousVariable.make('map_y')
        ])
        attrs = [
            ContinuousVariable.make(repr(data.labels[i]))
            for i in range(len(data.labels))
        ]
        data_3D = data.traces
        for i in np.ndindex(data_3D.shape[:1]):
            map_y = np.full_like(data.mapX, data.mapY[i])
            coord = np.column_stack((data.mapX, map_y))
            if y_data is None:
                y_data = data_3D[i]
                meta_data = coord.astype(object)
            else:
                y_data = np.vstack((y_data, data_3D[i]))
                meta_data = np.vstack((meta_data, coord))
    elif type(data) == opusFC.TimeResolvedTRCDataReturn:
        y_data = data.traces
    elif type(data) == opusFC.TimeResolvedDataReturn:
        metas.extend([ContinuousVariable.make('z')])
        y_data = data.spectra
        meta_data = data.z
    elif type(data) == opusFC.SingleDataReturn:
        y_data = data.y[None, :]
    else:
        raise ValueError(
            "Empty or unsupported opusFC DataReturn object: " + type(data))

    # Attach selected OPUS parameters (start time SRT, sample name SNM)
    # as extra meta columns, one constant value per row.
    import_params = ['SRT', 'SNM']

    for param_key in import_params:
        try:
            param = data.parameters[param_key]
        except KeyError:
            pass  # TODO should notify user?
        else:
            try:
                param_name = opusFC.paramDict[param_key]
            except KeyError:
                param_name = param_key
            if param_key == 'SRT':
                var = TimeVariable.make(param_name)
            elif type(param) is float:
                var = ContinuousVariable.make(param_name)
            elif type(param) is str:
                var = StringVariable.make(param_name)
            else:
                raise ValueError #Found a type to handle
            metas.extend([var])
            params = np.full((y_data.shape[0], ), param,
                             np.array(param).dtype)
            if meta_data is not None:
                # NB dtype default will be np.array(fill_value).dtype in future
                meta_data = np.column_stack(
                    (meta_data, params.astype(object)))
            else:
                meta_data = params

    domain = Orange.data.Domain(attrs, clses, metas)

    meta_data = np.atleast_2d(meta_data)

    table = Orange.data.Table.from_numpy(domain,
                                         y_data.astype(float, order='C'),
                                         metas=meta_data)

    return table
def vars_from_df(df, role=None, force_nominal=False):
    """
    Build Orange variables and data arrays from a pandas DataFrame.

    Each column is assigned a role (attribute / class / meta) and a
    matching Variable; columns that need no conversion share memory with
    the DataFrame where possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; may carry `orange_role` / `orange_variables`
        attributes from a previous Orange -> pandas conversion.
    role : Role, optional
        Force all columns into this role; default is per-column inference.
    force_nominal : bool
        If True, treat all string columns as discrete.

    Returns
    -------
    tuple
        ([X, Y, M] arrays, Domain).
    """
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    # One slot per role: (attributes, class_vars, metas) -- indexed by Role.
    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    for column in df.columns:
        s = df[column]
        _role = Role.Attribute if role is None else role
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            # Column originated from an Orange table: reuse its variable.
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(str(column),
                                   discrete.categories.astype(str).tolist())
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column),
                number_of_decimals=(0 if is_integer_dtype(s) else None))
            expr = None
        else:
            # Anything else becomes a string column, which must be a meta.
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = lambda s, _: np.asarray(s, dtype=object)

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
        if not a_cols:
            # X must exist (possibly 0-width); Y and M may be None.
            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
        elif not any(a_expr):
            # if all c in columns table will share memory with dataframe
            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array([
                expr(df[col], var) if expr else np.asarray(df[col])
                for var, col, expr in zip(a_vars, a_cols, a_expr)
            ]).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
def test_parse_timestamp(self):
    """Parsing a stringified UTC POSIX timestamp round-trips to its datetime repr."""
    var = TimeVariable("time")
    stamp = datetime(2016, 6, 14, 23, 8, tzinfo=timezone.utc).timestamp()
    parsed = var.parse(str(stamp))
    self.assertEqual(var.repr_val(parsed), '2016-06-14 23:08:00')
class TwitterAPI:
    """Fetch tweets from the Twitter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets
        from previous searches and only return results from the last search
        either call the `reset` method before searching or provide the
        `collecting=False` argument to a search method.
    """

    # No attribute/class columns by default; tweet fields go into metas
    # (the author variable moves to class_vars for author searches --
    # see create_corpus).
    attributes = []
    class_vars = []

    tv = TimeVariable("Date")
    # (variable, extractor) pair for the tweet author.
    authors = [
        (DiscreteVariable("Author"), lambda doc: "@" + doc.author.screen_name,),
    ]
    # (variable, extractor) pairs evaluated per tweepy status object to
    # fill the corpus meta columns.
    metas = [
        (
            StringVariable("Content"),
            lambda doc: doc.full_text if not doc.retweeted else doc.text,
        ),
        # temporary fix until Orange>3.30.1 then change back to
        # (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (tv, lambda doc: TwitterAPI.tv.parse(
            TwitterAPI.tv._tzre_sub(doc.created_at.isoformat()))),
        (DiscreteVariable("Language"), lambda doc: doc.lang),
        (
            # doc.place may be None, hence the getattr default.
            DiscreteVariable("Location"),
            lambda doc: getattr(doc.place, "country_code", None),
        ),
        (
            ContinuousVariable("Number of Likes", number_of_decimals=0),
            lambda doc: doc.favorite_count,
        ),
        (
            ContinuousVariable("Number of Retweets", number_of_decimals=0),
            lambda doc: doc.retweet_count,
        ),
        (
            DiscreteVariable("In Reply To"),
            lambda doc: "@" + doc.in_reply_to_screen_name
            if doc.in_reply_to_screen_name
            else "",
        ),
        (DiscreteVariable("Author Name"), lambda doc: doc.author.name),
        (
            StringVariable("Author Description"),
            lambda doc: doc.author.description,
        ),
        (
            ContinuousVariable("Author Statuses Count", number_of_decimals=0),
            lambda doc: doc.author.statuses_count,
        ),
        (
            ContinuousVariable("Author Favourites Count", number_of_decimals=0),
            lambda doc: doc.author.favourites_count,
        ),
        (
            ContinuousVariable("Author Friends Count", number_of_decimals=0),
            lambda doc: doc.author.friends_count,
        ),
        (
            ContinuousVariable("Author Followers Count", number_of_decimals=0),
            lambda doc: doc.author.followers_count,
        ),
        (
            ContinuousVariable("Author Listed Count", number_of_decimals=0),
            lambda doc: doc.author.listed_count,
        ),
        (
            DiscreteVariable("Author Verified"),
            lambda doc: str(doc.author.verified),
        ),
        (
            ContinuousVariable("Longitude"),
            lambda doc: coordinates_geoJSON(doc.coordinates)[0],
        ),
        (
            ContinuousVariable("Latitude"),
            lambda doc: coordinates_geoJSON(doc.coordinates)[1],
        ),
    ]

    text_features = [metas[0][0]]  # Content
    string_attributes = [m for m, _ in metas if isinstance(m, StringVariable)]

    def __init__(self, credentials):
        # credentials must expose a tweepy auth handler via its `auth`
        # attribute -- TODO confirm against the credentials class.
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        # tweet id -> tweet; keeps download order and deduplicates across
        # successive searches.
        self.container = OrderedDict()
        self.search_history = []

    @property
    def tweets(self):
        """All downloaded tweets, aggregated across searches."""
        return self.container.values()

    def search_content(
        self, content, *, max_tweets=0, lang=None, allow_retweets=True,
        collecting=False, callback=None
    ):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            lang (str): A language's code (either ISO 639-1 or ISO 639-3
                formats).
            allow_retweets(bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            max_tweets = float("Inf")

        def build_query():
            nonlocal content
            if not content:
                # NOTE(review): an empty content list yields the bare query
                # "from: " -- presumably intentional, verify against callers.
                q = "from: "
            else:
                if not isinstance(content, list):
                    content = [content]
                # Quote each keyword and OR them together.
                q = " OR ".join(['"{}"'.format(q) for q in content])
            if not allow_retweets:
                q += " -filter:retweets"
            return q

        query = build_query()
        cursor = tweepy.Cursor(
            self.api.search_tweets, q=query, lang=lang, tweet_mode="extended"
        )

        corpus, count = self.fetch(
            cursor, max_tweets, search_author=False, callback=callback
        )
        self.append_history(
            "Content",
            content,
            lang if lang else "Any",
            str(allow_retweets),
            count,
        )
        return corpus

    def search_authors(
        self, authors, *, max_tweets=0, collecting=False, callback=None
    ):
        """ Search by authors.

        Args:
            authors (list of str): A list of authors to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            # set to max allowed for progress
            max_tweets = 3200

        if not isinstance(authors, list):
            authors = [authors]

        cursors = [
            tweepy.Cursor(
                self.api.user_timeline, screen_name=a, tweet_mode="extended"
            )
            for a in authors
        ]
        corpus, count = self.fetch(
            cursors, max_tweets, search_author=True, callback=callback
        )
        self.append_history("Author", authors, None, None, count)
        return corpus

    def fetch(self, cursors, max_tweets, search_author, callback):
        """Drain the cursor(s) into the container and build a corpus.

        Returns:
            (Corpus, int): the corpus and the number of NEW (not previously
            seen) tweets added by this call.
        """
        if not isinstance(cursors, list):
            cursors = [cursors]

        count = 0
        for i, cursor in enumerate(cursors):
            for j, tweet in enumerate(cursor.items(max_tweets), start=1):
                if tweet.id not in self.container:
                    count += 1
                    self.container[tweet.id] = tweet

                # Report progress every 20 tweets, assuming each cursor
                # yields up to max_tweets items.
                if j % 20 == 0:
                    if callback is not None:
                        callback(
                            (i * max_tweets + j) / (len(cursors) * max_tweets)
                        )

        return self.create_corpus(search_author), count

    def create_corpus(self, search_author):
        """Build a Corpus; author is the class variable for author searches,
        otherwise it is appended to the metas."""
        if search_author:
            class_vars = self.authors
            metas = self.metas
        else:
            class_vars = []
            metas = self.metas + self.authors

        return Corpus.from_documents(
            self.tweets,
            "Twitter",
            self.attributes,
            class_vars,
            metas,
            title_indices=[-1],
        )

    def reset(self):
        """ Removes all downloaded tweets. """
        self.search_history = []
        self.container = OrderedDict()

    def append_history(self, mode, query, lang, allow_retweets, n_tweets):
        """Record one search (query, mode, language, retweet flag, count).

        NOTE(review): a plain str is also Iterable, so a bare-string query
        would be joined character by character -- callers appear to always
        pass lists; confirm.
        """
        query = ", ".join(query) if isinstance(query, Iterable) else query
        if lang in code2lang.keys():
            lang = code2lang[lang]
        self.search_history.append(
            (
                ("Query", query),
                ("Search by", mode),
                ("Language", lang),
                ("Allow retweets", allow_retweets),
                ("Tweets count", n_tweets),
            )
        )

    def report(self):
        """Return the accumulated search history tuples."""
        return self.search_history