def test_table_from_frame_time(self):
    """Time-only input must yield a TimeVariable with time but no date.

    The same check is run for a column of ``pd.Timestamp`` objects and for
    a column of plain strings; in both cases the stored values are expected
    to equal the timestamps of those times on 1970-01-01.
    """
    from Orange.data.pandas_compat import table_from_frame

    expected = [
        [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
        [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
        [np.nan],
    ]
    columns = (
        # already parsed by pandas
        [pd.Timestamp("00:00:00.25"), pd.Timestamp("20:20:20.30"), np.nan],
        # raw strings that still have to be recognised as times
        ["00:00:00.25", "20:20:20.30", np.nan],
    )
    for column in columns:
        frame = pd.DataFrame([[value] for value in column])
        table = table_from_frame(frame)
        np.testing.assert_equal(table.X, expected)
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 0)
def test_table_from_frame_datetime(self):
    """Full date-times must yield a TimeVariable with both date and time.

    Three equivalent inputs are checked: ``pd.Timestamp`` objects, plain
    strings and ``datetime.datetime`` objects. Each column also contains a
    missing value, which is expected to stay NaN.
    """
    from Orange.data.pandas_compat import table_from_frame

    expected = [
        [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
        [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
        [np.nan],
    ]
    columns = (
        [
            pd.Timestamp("2017-12-19 00:00:00.50"),
            pd.Timestamp("1724-12-20 20:20:20.30"),
            np.nan,
        ],
        ["2017-12-19 00:00:00.50", "1724-12-20 20:20:20.30", np.nan],
        [
            datetime(2017, 12, 19, 0, 0, 0, 500000),
            datetime(1724, 12, 20, 20, 20, 20, 300000),
            np.nan,
        ],
    )
    for column in columns:
        frame = pd.DataFrame([[value] for value in column])
        table = table_from_frame(frame)
        np.testing.assert_equal(table.X, expected)
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 1)
def test_table_from_frame(self):
    """Convert a mixed-type frame and verify values, names and types.

    Three scenarios are exercised on the same data: the default
    conversion, conversion with ``force_nominal=True``, and conversion
    after the frame received a string-labelled index.
    """
    from Orange.data.pandas_compat import table_from_frame

    nan = np.nan
    ts_2017 = pd.Timestamp('2017-12-19').timestamp()
    ts_1724 = pd.Timestamp('1724-12-20').timestamp()

    def names_and_types(tbl):
        # Names and exact classes of the table's attributes, in order.
        attributes = tbl.domain.attributes
        return ([var.name for var in attributes],
                [type(var) for var in attributes])

    df = pd.DataFrame([
        ['a', 1, pd.Timestamp('2017-12-19')],
        ['b', 0, pd.Timestamp('1724-12-20')],
        ['c', 0, pd.Timestamp('1724-12-20')],
        [nan, nan, nan],
    ])

    # Default conversion: the string column is expected among the metas
    table = table_from_frame(df)
    np.testing.assert_equal(
        table.X,
        [[1, ts_2017], [0, ts_1724], [0, ts_1724], [nan, nan]])
    np.testing.assert_equal(table.metas.tolist(),
                            [['a'], ['b'], ['c'], [nan]])
    names, types = names_and_types(table)
    self.assertEqual(names, ['1', '2'])
    self.assertEqual(types, [ContinuousVariable, TimeVariable])

    # Force strings nominal
    table = table_from_frame(df, force_nominal=True)
    np.testing.assert_equal(
        table.X,
        [[0, 1, ts_2017], [1, 0, ts_1724], [2, 0, ts_1724],
         [nan, nan, nan]])
    np.testing.assert_equal(table.metas.tolist(), [[], [], [], []])
    names, types = names_and_types(table)
    self.assertEqual(names, ['0', '1', '2'])
    self.assertEqual(types,
                     [DiscreteVariable, ContinuousVariable, TimeVariable])

    # Include index
    df.index = list('abaa')
    table = table_from_frame(df)
    np.testing.assert_equal(
        table.X,
        [[0, 1, ts_2017], [1, 0, ts_1724], [0, 0, ts_1724],
         [0, nan, nan]])
    np.testing.assert_equal(table.metas.tolist(),
                            [['a'], ['b'], ['c'], [nan]])
    names, types = names_and_types(table)
    self.assertEqual(names, ['index', '1', '2'])
    self.assertEqual(types,
                     [DiscreteVariable, ContinuousVariable, TimeVariable])
def test_table_from_frame(self):
    """Check ``table_from_frame`` on a frame mixing strings, numbers, dates.

    The frame is converted as-is, with ``force_nominal=True`` and with a
    string index; for each result the X matrix, the metas, and the names
    and classes of the attributes are compared with the expected values.
    """
    from Orange.data.pandas_compat import table_from_frame

    nan = np.nan
    new = pd.Timestamp('2017-12-19')
    old = pd.Timestamp('1724-12-20')

    def check(table, x, metas, names, types):
        # One full round of assertions for a converted table.
        np.testing.assert_equal(table.X, x)
        np.testing.assert_equal(table.metas.tolist(), metas)
        self.assertEqual(
            [var.name for var in table.domain.attributes], names)
        self.assertEqual(
            [type(var) for var in table.domain.attributes], types)

    df = pd.DataFrame([['a', 1, new],
                       ['b', 0, old],
                       ['c', 0, old],
                       [nan, nan, nan]])

    check(
        table_from_frame(df),
        x=[[1, new.timestamp()],
           [0, old.timestamp()],
           [0, old.timestamp()],
           [nan, nan]],
        metas=[['a'], ['b'], ['c'], [nan]],
        names=['1', '2'],
        types=[ContinuousVariable, TimeVariable],
    )

    # Force strings nominal
    check(
        table_from_frame(df, force_nominal=True),
        x=[[0, 1, new.timestamp()],
           [1, 0, old.timestamp()],
           [2, 0, old.timestamp()],
           [nan, nan, nan]],
        metas=[[], [], [], []],
        names=['0', '1', '2'],
        types=[DiscreteVariable, ContinuousVariable, TimeVariable],
    )

    # Include index
    df.index = list('abaa')
    check(
        table_from_frame(df),
        x=[[0, 1, new.timestamp()],
           [1, 0, old.timestamp()],
           [0, 0, old.timestamp()],
           [0, nan, nan]],
        metas=[['a'], ['b'], ['c'], [nan]],
        names=['index', '1', '2'],
        types=[DiscreteVariable, ContinuousVariable, TimeVariable],
    )
def testa_table_from_frame_string(self):
    """
    String-like columns (``object`` and ``string`` dtype) must become
    StringVariable metas, with missing values replaced by
    ``StringVariable.Unknown``.
    """
    from Orange.data.pandas_compat import table_from_frame

    rows = [["a", "b"], ["c", "d"], ["e", "f"], [5, "c"], [np.nan, np.nan]]
    # s1 is kept as object dtype (missing value is nan); s2 is cast to the
    # pandas string dtype (missing value is pd.NA)
    dtypes = {"s1": "object", "s2": "string"}
    df = pd.DataFrame(rows, columns=["s1", "s2"]).astype(dtypes)

    table = table_from_frame(df)

    # No column may end up among the attributes
    np.testing.assert_array_equal(np.empty((5, 0)), table.X)

    unknown = StringVariable.Unknown
    expected_metas = np.array([
        ["a", "b"],
        ["c", "d"],
        ["e", "f"],
        ["5", "c"],
        [unknown, unknown],
    ])
    np.testing.assert_array_equal(expected_metas, table.metas)

    are_strings = [isinstance(var, StringVariable)
                   for var in table.domain.metas]
    self.assertTrue(all(are_strings))
def test_table_from_frame_keep_ids(self):
    """Index labels of the form ``_o<number>`` must be reused as row ids.

    The index mixes such labels with other values; only the last two rows
    (``_o20`` and ``_o30``) are checked explicitly, together with the
    requirement that the resulting ids array stays numeric.
    """
    from Orange.data.pandas_compat import table_from_frame

    frame = OrangeDataFrame(Table('iris')[:6])
    frame.index = [1, "_oa", "_o", "1", "_o20", "_o30"]

    converted = table_from_frame(frame)

    trailing_ids = converted.ids[-2:].tolist()
    self.assertEqual(trailing_ids, [20, 30])
    ids_are_numeric = np.issubdtype(converted.ids.dtype, np.number)
    self.assertTrue(ids_are_numeric)
def test_table_from_frame_no_datetim(self):
    """
    An object-dtype column that holds only numbers must not be turned into
    a TimeVariable, even though pd.to_datetime is able to parse numbers as
    datetimes. Because its dtype is object, such a column has to result in
    either a StringVariable or a DiscreteVariable.
    """
    from Orange.data.pandas_compat import table_from_frame

    # all values distinct: expected as a StringVariable among the metas
    unique_values = pd.DataFrame([[1], [2], [3]], dtype="object")
    table = table_from_frame(unique_values)
    self.assertIsInstance(table.domain.metas[0], StringVariable)

    # a repeated value: expected as a DiscreteVariable among the attributes
    repeated_values = pd.DataFrame([[1], [2], [2]], dtype="object")
    table = table_from_frame(repeated_values)
    self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)
def graph2tables(g, name="network"):
    """Convert a graph into two Orange tables, one for vertices, one for edges.

    Parameters
    ----------
    g
        Graph object exposing ``vs``, ``es`` and ``is_directed()`` in the
        way an igraph ``Graph`` does.
    name : str
        Base name; the tables are named ``name + "*vertices"`` and
        ``name + "*edges"``.

    Returns
    -------
    tuple
        ``(vtable, etable)``: the vertex table and the edge table.
    """
    import pandas as pd
    from Orange.data.pandas_compat import table_from_frame

    # vertices data table
    vertices = pd.DataFrame()
    for vattr in g.vs.attribute_names():
        vertices[vattr] = g.vs[vattr]
    # When created with Graph.DictList(), the id/source/target attributes
    # are strings, so a vertex's id is always replaced by its index.
    vertices["id"] = [vertex.index for vertex in g.vs]
    vtable = table_from_frame(vertices)
    # NOTE: the former ``vtable.X[:, 0].astype(np.int)`` calls were dropped.
    # Their result was discarded, so they never changed the table, and
    # ``np.int`` no longer exists in NumPy >= 1.24 (AttributeError).
    vtable.name = name + "*vertices"

    # edges data table; endpoints are vertex indices taken from edge.tuple
    endpoints = [edge.tuple for edge in g.es]
    edges = pd.DataFrame({
        "source": [pair[0] for pair in endpoints],
        "target": [pair[1] for pair in endpoints],
    })

    eattrs = g.es.attribute_names()
    if "name" in g.vs.attribute_names():
        # A graph with a vertex "name" attribute was built with
        # Graph.DictList(); its edges then carry extra "source"/"target"
        # attributes that must not overwrite the index-based columns above.
        eattrs = [attr for attr in eattrs if attr not in ("source", "target")]
    for eattr in eattrs:
        edges[eattr] = g.es[eattr]

    edges["isDirected"] = 1 if g.is_directed() else 0
    etable = table_from_frame(edges)
    etable.name = name + "*edges"

    # return two tables of vertices and edges
    return (vtable, etable)
def callR(self, table):
    """Run the R test function and send its result on the corpus output.

    Nothing is done when ``table`` is falsy. Otherwise the R script at
    ``r_path`` is sourced, ``testfun("Wouter")`` is evaluated, and its
    string representation is sent as a one-row, one-column table.
    """
    import rpy2.robjects as robjects

    if not table:
        return

    # Load the R script so that ``testfun`` is defined in the R session
    robjects.r['source'](str(r_path))
    returnvalue = str(robjects.r('testfun("Wouter")'))

    frame = pandas.DataFrame.from_dict([{'value': returnvalue}])
    self.Outputs.corpus.send(table_from_frame(frame))
def load(self):
    """Parse every matching mail file and send the result as a corpus.

    Files are found by globbing ``self._glob`` inside ``self.directory``.
    A file that fails to parse is reported on stdout (its path and the
    error) and skipped; the progress indicator is advanced once per file.
    """
    paths = list(Path(self.directory).glob(self._glob))
    self.progress.advance(0)
    self.progress.iter = len(paths)

    mails = []
    for path in paths:
        try:
            parsed = parse_enron_mail(path)
        except Exception as err:
            # best effort: report the offending file and carry on
            print(path)
            print(err)
        else:
            mails.append(parsed)
        self.progress.advance()

    table = table_from_frame(pd.DataFrame(mails))
    self.Outputs.data.send(Corpus.from_table(table.domain, table))
def test_time_variable_compatible(self):
    """TimeVariable parsing and the pandas conversion path must agree.

    For every ``(datestr, timestamp, outstr)`` case in
    ``TestTimeVariable.TESTS`` the string is parsed by
    ``TimeVariable.to_val`` and, independently, via ``pd.Timestamp`` and
    ``table_from_frame``. Both values must be equal, match the expected
    timestamp, and be rendered identically by ``repr_val``.
    """
    from Orange.data.pandas_compat import table_from_frame

    def to_df(val):
        # single-cell frame holding the value parsed by pandas
        return pd.DataFrame([[pd.Timestamp(val)]])

    for datestr, timestamp, outstr in TestTimeVariable.TESTS:
        var = TimeVariable("time")
        var_parse = var.to_val(datestr)
        try:
            pandas_parse = table_from_frame(to_df(datestr)).X[0, 0]
        except ValueError:
            # pandas cannot parse some formats in the list; skip them
            continue
        if not (np.isnan(var_parse) and np.isnan(pandas_parse)):
            # nan == nan => False, so only compare when not both are nan
            self.assertEqual(var_parse, pandas_parse)
            self.assertEqual(pandas_parse, timestamp)

        # Compare the rendering of the two independently parsed values.
        # This used to compare repr_val(var_parse) with itself, which
        # could never fail.
        self.assertEqual(var.repr_val(var_parse), var.repr_val(pandas_parse))
        self.assertEqual(outstr, var.repr_val(var_parse))
def finance_data(symbol, since=None, until=None, granularity='d'):
    """Fetch Yahoo Finance data for stock or index `symbol` within the
    period after `since` and before `until` (both inclusive).

    Parameters
    ----------
    symbol: str
        A stock or index symbol, as supported by Yahoo Finance.
    since: date
        A start date (default: 1900-01-01).
    until: date
        An end date (default: today).
    granularity: 'd' or 'w' or 'm' or 'v'
        What data to get: daily, weekly, monthly, or dividends.
        Accepted for compatibility; the function body does not read it.

    Returns
    -------
    data : Timeseries
        Table with 'Adj Close' as the class variable and 'Date' as the
        time variable; its name is set to `symbol`.
    """
    start = date(1900, 1, 1) if since is None else since
    end = date.today() if until is None else until

    frame = web.DataReader(symbol, 'yahoo', start, end)
    data = Timeseries(table_from_frame(frame))

    # Make Adjusted Close a class variable
    feature_names = [var.name for var in data.domain.attributes]
    feature_names.remove('Adj Close')
    domain = Domain(feature_names,
                    [data.domain['Adj Close']],
                    None,
                    source=data.domain)
    data = Timeseries(domain, data)

    data.name = symbol
    data.time_variable = data.domain['Date']
    return data
def finance_data(symbol, since=None, until=None, granularity='d'):
    """Fetch Yahoo Finance data for stock or index `symbol` within the
    period after `since` and before `until` (both inclusive).

    Parameters
    ----------
    symbol: str
        A stock or index symbol, as supported by Yahoo Finance.
    since: date
        A start date (default: 1900-01-01).
    until: date
        An end date (default: today).
    granularity: 'd' or 'w' or 'm' or 'v'
        What data to get: daily, weekly, monthly, or dividends.
        Accepted for compatibility; the function body does not read it.

    Returns
    -------
    data : Timeseries
        Table with 'Adj Close' as the class variable and 'Date' as the
        time variable; its name is set to `symbol`.
    """
    start = date(1900, 1, 1) if since is None else since
    end = date.today() if until is None else until

    quotes = web.DataReader(symbol, 'yahoo', start, end)
    data = Timeseries.from_data_table(table_from_frame(quotes))

    # Make Adjusted Close a class variable
    feature_names = [var.name for var in data.domain.attributes]
    feature_names.remove('Adj Close')
    target_domain = Domain(feature_names,
                           [data.domain['Adj Close']],
                           None,
                           source=data.domain)
    data = Timeseries.from_table(target_domain, data)

    data.name = symbol
    data.time_variable = data.domain['Date']
    return data
def test_table_from_frame(self):
    """Verify ``table_from_frame`` for a string/number/date frame.

    Checked in turn: the plain conversion, the conversion with
    ``force_nominal=True``, and the conversion of the same frame once it
    carries a string index.
    """
    from Orange.data.pandas_compat import table_from_frame

    nan = np.nan
    day_2017 = pd.Timestamp("2017-12-19")
    day_1724 = pd.Timestamp("1724-12-20")
    sec_2017 = day_2017.timestamp()
    sec_1724 = day_1724.timestamp()

    df = pd.DataFrame(
        [
            ["a", 1, day_2017],
            ["b", 0, day_1724],
            ["c", 0, day_1724],
            [nan, nan, nan],
        ]
    )

    # Plain conversion
    table = table_from_frame(df)
    expected_x = [
        [1, sec_2017],
        [0, sec_1724],
        [0, sec_1724],
        [nan, nan],
    ]
    np.testing.assert_equal(table.X, expected_x)
    np.testing.assert_equal(table.metas.tolist(), [["a"], ["b"], ["c"], [nan]])
    attributes = table.domain.attributes
    self.assertEqual([var.name for var in attributes], ["1", "2"])
    self.assertEqual(
        [type(var) for var in attributes], [ContinuousVariable, TimeVariable]
    )

    # Force strings nominal
    table = table_from_frame(df, force_nominal=True)
    expected_x = [
        [0, 1, sec_2017],
        [1, 0, sec_1724],
        [2, 0, sec_1724],
        [nan, nan, nan],
    ]
    np.testing.assert_equal(table.X, expected_x)
    np.testing.assert_equal(table.metas.tolist(), [[], [], [], []])
    attributes = table.domain.attributes
    self.assertEqual([var.name for var in attributes], ["0", "1", "2"])
    self.assertEqual(
        [type(var) for var in attributes],
        [DiscreteVariable, ContinuousVariable, TimeVariable],
    )

    # Include index
    df.index = list("abaa")
    table = table_from_frame(df)
    expected_x = [
        [0, 1, sec_2017],
        [1, 0, sec_1724],
        [0, 0, sec_1724],
        [0, nan, nan],
    ]
    np.testing.assert_equal(table.X, expected_x)
    np.testing.assert_equal(table.metas.tolist(), [["a"], ["b"], ["c"], [nan]])
    attributes = table.domain.attributes
    self.assertEqual([var.name for var in attributes], ["index", "1", "2"])
    self.assertEqual(
        [type(var) for var in attributes],
        [DiscreteVariable, ContinuousVariable, TimeVariable],
    )
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """Download expression files for the matching data objects and build a table.

    ``exp_type``, ``proc_type`` and ``input_annotation`` are indices into
    the corresponding lists of ``options``. The data objects are filtered
    by process type, source, species and build (and by expression type
    unless it is ``'rc'``). Each remaining object's expression file is
    read as a gzip-compressed, tab-separated file into one sample row.
    The rows are concatenated, converted to a table, extended with
    per-sample string metas, and finally the attributes are matched to
    genes.

    Progress and status messages are reported through ``state``.

    Raises
    ------
    ResolweDataObjectsNotFound
        When no data object is left after filtering.
    """
    sample_frames = []
    metadata = defaultdict(list)

    def collect_sample_metadata(sample: Sample) -> None:
        # One single-item row per sample for every descriptor label,
        # followed by the sample name.
        general = sample.descriptor.get('general', {})
        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])
        metadata['sample_name'].append([sample.name])

    # Resolve the selected indices into concrete option values
    exp_type = options.expression[exp_type].type
    proc_type = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    # Name of the output field holding the expression file
    file_output_field = exp_type if exp_type == 'rc' else 'exp'

    # apply filters
    data_objects = [
        obj
        for obj in data_objects
        if obj.process.type == proc_type
        and obj.output['source'] == source
        and obj.output['species'] == species
        and obj.output['build'] == build
    ]
    if exp_type != 'rc':
        data_objects = [obj for obj in data_objects if obj.output['exp_type'] == exp_type]

    if not data_objects:
        raise ResolweDataObjectsNotFound

    # One step per download plus three post-processing steps
    total_steps = len(data_objects) + 3
    finished_steps = 0

    def set_progress():
        nonlocal finished_steps
        finished_steps += 1
        state.set_progress_value(100 * (finished_steps / total_steps))

    state.set_status('Downloading ...')
    for data_object in data_objects:
        set_progress()
        collect_sample_metadata(data_object.sample)
        metadata['expression_type'].append([exp_type.upper()])

        file_name = data_object.output[file_output_field]['file']
        response = res.get_expressions(data_object.id, file_name)

        with io.BytesIO(response.content) as buffer:
            # expressions to data frame: genes become columns, one row per sample
            expressions = pd.read_csv(buffer, sep='\t', compression='gzip')
            expressions = expressions.set_index('Gene').T.reset_index(drop=True)
            sample_frames.append(expressions)

    state.set_status('Concatenating samples ...')
    df = pd.concat(sample_frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(df)
    set_progress()

    state.set_status('Adding metadata ...')
    metas = [StringVariable(label) for label in metadata]
    domain = Domain(table.domain.attributes, table.domain.class_vars, metas)
    table = table.transform(domain)
    for label, column in metadata.items():
        table[:, label] = column
    set_progress()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    matcher = GeneMatcher(tax_id)
    table = matcher.match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    set_progress()

    return table
"宁夏","陕西","山西","湖北","湖南","四川","云南","河北","河南","辽宁","山东","天津",\ "江西","江苏","上海","浙江","吉林","内蒙古","黑龙江","南海群岛"] # 生成符合Orange Geo格式的数据 gdf9 = gdf7[["admin", "name"]] gdf9["province"] = provs3 gdf10 = gdf[["area", "confirmed", "lat", "lng"]] gdf11 = pd.merge(gdf9, gdf10, left_on="province", right_on="area") # 输出CSV,再从Orange读入 gdf11.to_csv("covid-19-CHN.csv", index=False, encoding="utf-8") # 城市疫情数据转换为Orange data table,以便调试Orange Widget dfcc3 = pd.merge(gdf9, dfcc, left_on="province", right_on="area") dfcc3.rename(columns={"name_x": "name", "name_y": "city2"}, inplace=True) dfcc3.to_csv("D:/temp/COVID-19/cities_CHN.csv", encoding="utf-8", index=False) from Orange.data.pandas_compat import table_from_frame otc = table_from_frame(dfcc3) otc.save("D:/temp/COVID-19/covid-19-cities_CHN2.tab") # 墨卡托->gcj->GPS # 各市 from coord_convert.transform import gcj2wgs import pandas as pd dfcc3 = pd.read_csv("D:/temp/COVID-19/cities_CHN.csv", encoding="utf-8") lats = [] longs = [] for idx, row in dfcc3.iterrows(): lng, lat = Mercator_to_lonlat(row.lng, row.lat) x, y = gcj2wgs(lng, lat) lats.append(y) longs.append(x) dfcc3["lat"] = lats
def test_table_from_frame_timezones(self):
    """Check the timezone assigned to a TimeVariable and the stored values.

    Covered inputs: naive timestamps, explicit UTC (``Z``), a fixed
    ``+1`` offset, a named zone (``CET``), and a column mixing a
    zone-aware with a naive timestamp.
    """
    from Orange.data.pandas_compat import table_from_frame

    def convert(first, second):
        # Single-column frame with two timestamps and a missing value
        return table_from_frame(pd.DataFrame([[first], [second], [np.nan]]))

    def seconds(first, second):
        # Expected X column for two timestamp strings plus a missing value
        return [
            [pd.Timestamp(first).timestamp()],
            [pd.Timestamp(second).timestamp()],
            [np.nan],
        ]

    # naive timestamps
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00"),
        pd.Timestamp("1724-12-20 20:20:20"),
    )
    self.assertEqual(table.domain.variables[0].timezone, timezone.utc)

    # explicit UTC
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00Z"),
        pd.Timestamp("1724-12-20 20:20:20Z"),
    )
    self.assertEqual(pytz.utc, table.domain.variables[0].timezone)
    np.testing.assert_equal(
        table.X, seconds("2017-12-19 00:00:00", "1724-12-20 20:20:20")
    )

    # fixed offset of one hour
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00+1"),
        pd.Timestamp("1724-12-20 20:20:20+1"),
    )
    self.assertEqual(pytz.FixedOffset(60), table.domain.variables[0].timezone)
    np.testing.assert_equal(
        table.X, seconds("2017-12-19 00:00:00+1", "1724-12-20 20:20:20+1")
    )

    # named timezone
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
        pd.Timestamp("1724-12-20 20:20:20", tz="CET"),
    )
    self.assertEqual(pytz.timezone("CET"), table.domain.variables[0].timezone)
    np.testing.assert_equal(
        table.X, seconds("2017-12-19 00:00:00+1", "1724-12-20 20:20:20+1")
    )

    # zone-aware and naive timestamps mixed in one column
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
        pd.Timestamp("1724-12-20 20:20:20"),
    )
    self.assertEqual(pytz.utc, table.domain.variables[0].timezone)
    np.testing.assert_equal(
        table.X, seconds("2017-12-19 00:00:00+1", "1724-12-20 20:20:20")
    )