def test_column_dups2(self):

    # drop buggy GH 6240
    df = DataFrame({'A': np.random.randn(5),
                    'B': np.random.randn(5),
                    'C': np.random.randn(5),
                    'D': ['a', 'b', 'c', 'd', 'e']})

    expected = df.take([0, 1, 1], axis=1)
    df2 = df.take([2, 0, 1, 2, 1], axis=1)

    result = df2.drop('C', axis=1)
    assert_frame_equal(result, expected)

    # dropna
    df = DataFrame({'A': np.random.randn(5),
                    'B': np.random.randn(5),
                    'C': np.random.randn(5),
                    'D': ['a', 'b', 'c', 'd', 'e']})
    df.iloc[2, [0, 1, 2]] = np.nan
    df.iloc[0, 0] = np.nan
    df.iloc[1, 1] = np.nan
    df.iloc[:, 3] = np.nan
    expected = df.dropna(subset=['A', 'B', 'C'], how='all')
    expected.columns = ['A', 'A', 'B', 'C']

    df.columns = ['A', 'A', 'B', 'C']

    result = df.dropna(subset=['A', 'C'], how='all')
    assert_frame_equal(result, expected)
def test_unstack_fill_frame(self):

    # From a dataframe
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
    df.index = MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

    result = df.unstack(fill_value=-1)

    rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
    expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
    expected.columns = MultiIndex.from_tuples(
        [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
    assert_frame_equal(result, expected)

    # From a mixed type dataframe
    df["A"] = df["A"].astype(np.int16)
    df["B"] = df["B"].astype(np.float64)

    result = df.unstack(fill_value=-1)
    expected["A"] = expected["A"].astype(np.int16)
    expected["B"] = expected["B"].astype(np.float64)
    assert_frame_equal(result, expected)

    # From a dataframe with incorrect data type for fill_value
    result = df.unstack(fill_value=0.5)

    rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
    expected = DataFrame(rows, index=list("xyz"), dtype=float)
    expected.columns = MultiIndex.from_tuples(
        [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
    assert_frame_equal(result, expected)
def test_pivot_index_none(self):
    # gh-3962
    data = {
        "index": ["A", "B", "C", "C", "B", "A"],
        "columns": ["One", "One", "One", "Two", "Two", "Two"],
        "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
    }

    frame = DataFrame(data).set_index("index")
    result = frame.pivot(columns="columns", values="values")
    expected = DataFrame({"One": {"A": 1.0, "B": 2.0, "C": 3.0},
                          "Two": {"A": 1.0, "B": 2.0, "C": 3.0}})

    expected.index.name, expected.columns.name = "index", "columns"
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns="columns")

    expected.columns = pd.MultiIndex.from_tuples([("values", "One"),
                                                  ("values", "Two")],
                                                 names=[None, "columns"])
    expected.index.name = "index"
    assert_frame_equal(result, expected, check_names=False)
    self.assertEqual(result.index.name, "index")
    self.assertEqual(result.columns.names, (None, "columns"))

    expected.columns = expected.columns.droplevel(0)

    data = {
        "index": range(7),
        "columns": ["One", "One", "One", "Two", "Two", "Two"],
        "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
    }

    result = frame.pivot(columns="columns", values="values")
    expected.columns.name = "columns"
    assert_frame_equal(result, expected)
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()

    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_table=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
    '''Cluster the items with k-means.'''
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++',
                             batch_size=1000, n_init=10,
                             max_no_improvement=10)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)

    km.fit(X)
    transformed = km.transform(X)  # distance from each item to every cluster center
    labels = km.labels_

    dists = []
    for i in range(len(labels)):
        # distance from each item to the center of its assigned cluster
        dists.append(transformed[i, labels[i]])

    labels = DataFrame(labels)
    dists = DataFrame(dists)
    labels.columns = ['label']
    dists.columns = ['dists']

    # append the labels and distances to the original data
    self.data = pd.concat([labels, dists, self.data], axis=1)
    return km
def test_unstack_fill_frame(self):

    # From a dataframe
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
    df.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = df.unstack(fill_value=-1)

    rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
    expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)

    # From a mixed type dataframe
    df['A'] = df['A'].astype(np.int16)
    df['B'] = df['B'].astype(np.float64)

    result = df.unstack(fill_value=-1)
    expected['A'] = expected['A'].astype(np.int16)
    expected['B'] = expected['B'].astype(np.float64)
    assert_frame_equal(result, expected)

    # From a dataframe with incorrect data type for fill_value
    result = df.unstack(fill_value=0.5)

    rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
    expected = DataFrame(rows, index=list('xyz'), dtype=float)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)
def test_pivot_index_none(self):
    # gh-3962
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]
    }

    frame = DataFrame(data).set_index('index')
    result = frame.pivot(columns='columns', values='values')
    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}
    })

    expected.index.name, expected.columns.name = 'index', 'columns'
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns='columns')

    expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                  ('values', 'Two')],
                                                 names=[None, 'columns'])
    expected.index.name = 'index'
    tm.assert_frame_equal(result, expected, check_names=False)
    assert result.index.name == 'index'
    assert result.columns.names == (None, 'columns')

    expected.columns = expected.columns.droplevel(0)
    result = frame.pivot(columns='columns', values='values')
    expected.columns.name = 'columns'
    tm.assert_frame_equal(result, expected)
def retrieve_from_db_usa():
    """imports model, pulls mwh production data from db, and places into pandas df.
    Also pulls state for each plant_name, and places into dict."""

    # add parent directory to the path, so we can import model.py;
    # model is needed in order to update the database when this task is activated by cron
    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0, parentdir)
    import model
    s = model.connect()

    # retrieve DECEMBER production data, for all turbines at all power plants in the USA
    USA_gen_dec13_obj = s.execute('SELECT plant_name, state, fuel_type, dec_mwh_gen FROM "ProdGensDec2013" ')
    USA_gen_dec13_data = USA_gen_dec13_obj.fetchall()
    df_dec2013 = DataFrame(USA_gen_dec13_data)
    df_dec2013.columns = ['plant_name', 'state', 'fuel_type', 'dec_mwh_gen']

    # retrieve JAN-NOV 2014 production data, for all turbines at all power plants in the USA
    USA_gen_2014_obj = s.execute('SELECT plant_name, state, fuel_type, jan_mwh_gen, feb_mwh_gen, mar_mwh_gen, apr_mwh_gen, may_mwh_gen, jun_mwh_gen, jul_mwh_gen, aug_mwh_gen, sep_mwh_gen, oct_mwh_gen, nov_mwh_gen FROM "ProdGens" ')
    USA_gen_2014_data = USA_gen_2014_obj.fetchall()
    df_2014 = DataFrame(USA_gen_2014_data)
    df_2014.columns = ['plant_name', 'state', 'fuel_type', 'jan_mwh_gen', 'feb_mwh_gen',
                       'mar_mwh_gen', 'apr_mwh_gen', 'may_mwh_gen', 'jun_mwh_gen',
                       'jul_mwh_gen', 'aug_mwh_gen', 'sep_mwh_gen', 'oct_mwh_gen',
                       'nov_mwh_gen']

    return df_dec2013, df_2014
def save_to_file(self, fn):
    gg = DataFrame(self.power_series_apps_table)
    try:
        del gg['diff1']
        del gg['diff2']
    except Exception:
        print('')
    gg['Loc Events'] = self.loc.events_apps_1min['Apps']

    apps = self.loc.metadata.get_channels()
    sd = {}

    # Initialize series with 0s
    for app in apps:
        sd[app] = Series(0, index=gg.index)

    # Count location events for each appliance
    for index, row in gg.iterrows():
        try:
            if len(row['Loc Events']) > 0:
                for app in apps:
                    n = row['Loc Events'].count(app)
                    sd[app][index] = n
        except Exception:
            continue

    if self.loc.name == 'REDD':
        sd[(3, 4)] = sd[3]
        sd[(10, 20)] = sd[10]
        del sd[3]
        del sd[4]
        del sd[10]
        del sd[20]

    # Change column names and append them to gral table
    locevents = DataFrame(sd)
    locevents.columns = [(str(col) + ' locEv') for col in locevents]
    for locEv in locevents:
        gg[locEv] = locevents[locEv]

    # Get power values of each appliance and resample for 1min
    act = DataFrame(self.loc.appliances_consuming_times)
    act = act.resample('1Min')
    if self.loc.name == 'REDD':
        del act[3]
        del act[10]
        act.columns = [(3, 4), 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, (10, 20)]
    act.columns = [(str(col) + ' conEv') for col in act]
    for app in act:
        gg[app] = act[app]

    gg.columns = [str(col) for col in gg]
    gg = gg[sorted(gg.columns)]
    gg.to_csv(fn)
    return
def _series_add_constant(data, prepend):
    const = np.ones_like(data)
    const.name = 'const'
    if not prepend:
        results = DataFrame([data, const]).T
        results.columns = [data.name, 'const']
    else:
        results = DataFrame([const, data]).T
        results.columns = ['const', data.name]
    return results
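# Hedged usage sketch (not part of the original source): the helper above relies on
# very old pandas, where Series subclassed ndarray and np.ones_like() therefore kept
# the Series type (and its .name attribute). With current pandas the same result can
# be built explicitly; the names below are illustrative only.
import numpy as np
import pandas as pd

y = pd.Series([1.0, 2.0, 3.0], name='y')
const = pd.Series(np.ones(len(y)), index=y.index, name='const')
with_const = pd.concat([const, y], axis=1)   # the prepend=True layout
print(with_const.columns.tolist())           # ['const', 'y']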
def test_conversion_multiindex(self):
    d = {'comp_str': ["Fe2", "MnO2"]}

    df_1lvl = DataFrame(data=d)

    df_1lvl = StrToComposition().featurize_dataframe(
        df_1lvl, 'comp_str', multiindex=True)
    self.assertEqual(df_1lvl[("StrToComposition", "composition")].tolist(),
                     [Composition("Fe2"), Composition("MnO2")])

    df_2lvl = DataFrame(data=d)
    df_2lvl.columns = MultiIndex.from_product((["custom"],
                                               df_2lvl.columns.values))

    df_2lvl = StrToComposition().featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True)
    self.assertEqual(df_2lvl[("StrToComposition", "composition")].tolist(),
                     [Composition("Fe2"), Composition("MnO2")])

    df_2lvl = DataFrame(data=d)
    df_2lvl.columns = MultiIndex.from_product((["custom"],
                                               df_2lvl.columns.values))

    sto = StrToComposition(target_col_id='test')
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True)
    self.assertEqual(df_2lvl[("StrToComposition", "test")].tolist(),
                     [Composition("Fe2"), Composition("MnO2")])

    # if two level multiindex provided as target, it should be written there
    # here we test converting multiindex in place
    df_2lvl = DataFrame(data=d)
    df_2lvl.columns = MultiIndex.from_product((["custom"],
                                               df_2lvl.columns.values))

    sto = StrToComposition(target_col_id=None, overwrite_data=True)
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True, inplace=False)
    self.assertEqual(df_2lvl[("custom", "comp_str")].tolist(),
                     [Composition("Fe2"), Composition("MnO2")])

    # Try inplace multiindex conversion with return errors
    df_2lvl = DataFrame(data=d)
    df_2lvl.columns = MultiIndex.from_product((["custom"],
                                               df_2lvl.columns.values))

    sto = StrToComposition(target_col_id=None, overwrite_data=True)
    df_2lvl = sto.featurize_dataframe(
        df_2lvl, ("custom", "comp_str"), multiindex=True,
        return_errors=True, ignore_errors=True)
    self.assertTrue(
        all(df_2lvl[("custom", "StrToComposition Exceptions")].isnull()))
def test_to_csv_dups_cols(self):

    df = DataFrame(np.random.randn(1000, 30),
                   columns=lrange(15) + lrange(15),
                   dtype='float64')

    with ensure_clean() as filename:
        df.to_csv(filename)  # single dtype, fine
        result = read_csv(filename, index_col=0)
        result.columns = df.columns
        assert_frame_equal(result, df)

    df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
    df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
    df_bool = DataFrame(True, index=df_float.index, columns=lrange(3))
    df_object = DataFrame('foo', index=df_float.index, columns=lrange(3))
    df_dt = DataFrame(Timestamp('20010101'),
                      index=df_float.index, columns=lrange(3))
    df = pd.concat([df_float, df_int, df_bool, df_object, df_dt],
                   axis=1, ignore_index=True)

    cols = []
    for i in range(5):
        cols.extend([0, 1, 2])
    df.columns = cols

    from pandas import to_datetime
    with ensure_clean() as filename:
        df.to_csv(filename)
        result = read_csv(filename, index_col=0)

        # date cols
        for i in ['0.4', '1.4', '2.4']:
            result[i] = to_datetime(result[i])

        result.columns = df.columns
        assert_frame_equal(result, df)

    # GH3457
    from pandas.util.testing import makeCustomDataframe as mkdf

    N = 10
    df = mkdf(N, 3)
    df.columns = ['a', 'a', 'b']

    with ensure_clean() as filename:
        df.to_csv(filename)

        # read_csv will rename the dups columns
        result = read_csv(filename, index_col=0)
        result = result.rename(columns={'a.1': 'a'})
        assert_frame_equal(result, df)
def compute_one(t, df, **kwargs):
    if t.grouper.iscolumn:
        grouper = compute(t.grouper, {t.child: df})  # a Series
    elif isinstance(t.grouper, Projection) and t.grouper.child is t.child:
        grouper = t.grouper.columns  # list of column names

    if isinstance(t.apply, Summary):
        names = t.apply.names
        preapply = DataFrame(dict(zip(
            names,
            [compute(v.child, {t.child: df}) for v in t.apply.values])))

        df2 = concat_nodup(df, preapply)

        groups = df2.groupby(grouper)

        d = defaultdict(list)
        for name, v in zip(names, t.apply.values):
            d[name].append(getattr(Series, v.symbol))

        result = groups.agg(dict(d))

        # Rearrange columns to match names order
        result = result[sorted(list(result.columns),
                               key=lambda t: names.index(t[0]))]
        result.columns = t.apply.names  # flatten down multiindex

    if isinstance(t.apply, Reduction):
        names = t.apply.dshape[0].names
        preapply = compute(t.apply.child, {t.child: df})

        # Pandas and Blaze column naming schemes differ
        # Coerce DataFrame column names to match Blaze's names
        preapply = preapply.copy()
        if isinstance(preapply, Series):
            preapply.name = names[0]
        else:
            preapply.columns = names

        df2 = concat_nodup(df, preapply)

        if t.apply.child.iscolumn:
            groups = df2.groupby(grouper)[names[0]]
        else:
            groups = df2.groupby(grouper)[names]

        result = compute_one(t.apply, groups)  # do reduction

    result = DataFrame(result).reset_index()
    result.columns = t.columns
    return result
def deserialize(self, item, force_bytes_to_unicode=False):
    index = self._index_from_records(item)
    column_fields = [x for x in item.dtype.names
                     if x not in item.dtype.metadata['index']]
    multi_column = item.dtype.metadata.get('multi_column')
    if len(item) == 0:
        rdata = item[column_fields] if len(column_fields) > 0 else None
        if multi_column is not None:
            columns = MultiIndex.from_arrays(multi_column["values"],
                                             names=multi_column["names"])
            return DataFrame(rdata, index=index, columns=columns)
        else:
            return DataFrame(rdata, index=index)

    columns = item.dtype.metadata['columns']
    df = DataFrame(data=item[column_fields], index=index, columns=columns)

    if multi_column is not None:
        df.columns = MultiIndex.from_arrays(multi_column["values"],
                                            names=multi_column["names"])

    if force_bytes_to_unicode:
        # This is needed because 'str' data written in py2 is read back in py3 as 'bytes',
        # which breaks the workflow of people migrating to py3.
        # https://github.com/manahl/arctic/issues/598
        # This should not be used for a normal flow; you should instead write unicode
        # strings if you want to work with str in py3.
        for c in df.select_dtypes(object):
            # The conversion does not use astype, similar to the index, as pandas has a bug
            # where it tries to convert the data columns to a unicode string, and the object
            # in this case would be bytes, e.g. b'abc', which is converted to u"b'abc'",
            # i.e. it includes the b character as well! This generally happens when there is
            # a str conversion without specifying the encoding, e.g. str(b'abc') -> "b'abc'";
            # the fix is to tell it the encoding to use: str(b'abc', 'utf-8') -> "abc"
            if type(df[c].iloc[0]) == bytes:
                df[c] = df[c].str.decode('utf-8')

        if isinstance(df.index, MultiIndex):
            unicode_indexes = []
            # MultiIndex requires a conversion at each level.
            for level in range(len(df.index.levels)):
                _index = df.index.get_level_values(level)
                if isinstance(_index[0], bytes):
                    _index = _index.astype('unicode')
                unicode_indexes.append(_index)
            df.index = unicode_indexes
        else:
            if type(df.index[0]) == bytes:
                df.index = df.index.astype('unicode')

        if type(df.columns[0]) == bytes:
            df.columns = df.columns.astype('unicode')

    return df
def test_set_value_by_index(self):
    # See gh-12344
    df = DataFrame(np.arange(9).reshape(3, 3).T)
    df.columns = list('AAA')
    expected = df.iloc[:, 2]

    df.iloc[:, 0] = 3
    assert_series_equal(df.iloc[:, 2], expected)

    df = DataFrame(np.arange(9).reshape(3, 3).T)
    df.columns = [2, float(2), str(2)]
    expected = df.iloc[:, 1]

    df.iloc[:, 0] = 3
    assert_series_equal(df.iloc[:, 1], expected)
def components(self):
    """
    Return a dataframe of the components (days, hours, minutes,
    seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

    Returns
    -------
    a DataFrame
    """
    from pandas import DataFrame

    columns = ['days', 'hours', 'minutes', 'seconds',
               'milliseconds', 'microseconds', 'nanoseconds']
    hasnans = self.hasnans
    if hasnans:
        def f(x):
            if isnull(x):
                return [np.nan] * len(columns)
            return x.components
    else:
        def f(x):
            return x.components

    result = DataFrame([f(x) for x in self])
    result.columns = columns
    if not hasnans:
        result = result.astype('int64')
    return result
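# Illustrative usage sketch (assumes only pandas is available): the components
# property above produces one row per Timedelta with the seven integer fields
# listed in `columns`, reachable through the public TimedeltaIndex API.
import pandas as pd

tdi = pd.to_timedelta(['1 days 02:03:04.000005', '2 days 00:00:01'])
comps = tdi.components  # DataFrame with days ... nanoseconds columns
print(comps[['days', 'hours', 'minutes', 'seconds']])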
def test_include_na(self):
    s = ['a', 'b', np.nan]
    res = get_dummies(s, sparse=self.sparse)
    exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
                     'b': {0: 0.0, 1: 1.0, 2: 0.0}})
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
    exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
                        'a': {0: 1.0, 1: 0.0, 2: 0.0},
                        'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(
                            ['a', 'b', nan], 1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
    exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_applymap(self):
    applied = self.frame.applymap(lambda x: x * 2)
    assert_frame_equal(applied, self.frame * 2)
    result = self.frame.applymap(type)

    # GH #465, function returning tuples
    result = self.frame.applymap(lambda x: (x, x))
    tm.assertIsInstance(result['A'][0], tuple)

    # GH 2909, object conversion to float in constructor?
    df = DataFrame(data=[1, 'a'])
    result = df.applymap(lambda x: x)
    self.assertEqual(result.dtypes[0], object)

    df = DataFrame(data=[1., 'a'])
    result = df.applymap(lambda x: x)
    self.assertEqual(result.dtypes[0], object)

    # GH2786
    df = DataFrame(np.random.random((3, 4)))
    df2 = df.copy()
    cols = ['a', 'a', 'a', 'a']
    df.columns = cols

    expected = df2.applymap(str)
    expected.columns = cols
    result = df.applymap(str)
    assert_frame_equal(result, expected)

    # datetime/timedelta
    df['datetime'] = Timestamp('20130101')
    df['timedelta'] = pd.Timedelta('1 min')
    result = df.applymap(str)
    for f in ['datetime', 'timedelta']:
        self.assertEqual(result.loc[0, f], str(df.loc[0, f]))
def test_iloc_setitem_dups(self):

    # GH 6766
    # iloc with a mask aligning from another iloc
    df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
    df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
    df = concat([df1, df2], axis=1)

    expected = df.fillna(3)
    expected['A'] = expected['A'].astype('float64')
    inds = np.isnan(df.iloc[:, 0])
    mask = inds[inds].index
    df.iloc[mask, 0] = df.iloc[mask, 2]
    tm.assert_frame_equal(df, expected)

    # del a dup column across blocks
    expected = DataFrame({0: [1, 2], 1: [3, 4]})
    expected.columns = ['B', 'B']
    del df['A']
    tm.assert_frame_equal(df, expected)

    # assign back to self
    df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]]
    tm.assert_frame_equal(df, expected)

    # reversed x 2
    df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True)
    df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True)
    tm.assert_frame_equal(df, expected)
def time_regex(data, col, form, nulls):
    '''
    Method to format the date columns in the raw data based on user input.
    Returns 3 formatted columns i.e. (year, month, day) including nulls.
    '''
    fields = ['month', 'day', 'year']

    if any(isinstance(i, list) for i in col):
        col = list(chain.from_iterable(col))
    else:
        pass
    print(type(col))
    print(col)

    if len(nulls) > 0:
        nulldf = hlp.produce_null_df(
            len(nulls), nulls, len(data), 'NaN')
    else:
        nulldf = DataFrame()

    try:
        if col[0] is not None:
            time_list_re = hlp.strip_time(data, col)
        else:
            time_list_re = []
    except Exception as e:
        print(str(e))
        raise AttributeError('Could not strip time format')

    notnull = [x for x in fields if x not in nulls]

    for i, item in enumerate(form):
        try:
            time_form_list = []
            for j in time_list_re:
                time_form_list.append(
                    [to_datetime(x, format=form[i]) for x in j]
                )

            if len(time_form_list) > 1:
                timedf = DataFrame(
                    [list(x) for x in zip(*time_form_list)])
            else:
                timedf = DataFrame(time_form_list[0])

            if len(notnull) == 1:
                timedf.columns = notnull
            else:
                pass

            final = {'formatted': timedf, 'null': nulldf}
            return final

        except Exception as e:
            print(str(e))
            print('Trying different format')
def create_df(db='parking.min.db', save_as='parking.df.pickle'):
    conn = sqlite3.connect(db)
    rows = conn.execute('''select updated, park_id, free_places from parking_min''').fetchall()
    ids = list(set([t[1] for t in rows]))

    data = {}
    for x in ids:
        dates = [np.datetime64(r[0], 's') for r in rows if r[1] == x]  # updated
        y = [r[2] for r in rows if r[1] == x]  # free_places (target)
        data[x] = Series(y, index=dates)

    # convert data to DataFrame
    df = DataFrame(data)

    # get the names
    nr = conn.execute('''SELECT DISTINCT name FROM parking ORDER BY park_id''').fetchall()
    # replace non ascii chars
    names = [unicodedata.normalize('NFKD', x[0]).encode('ascii', 'ignore')
             for x in nr]
    # remove dots
    names = [x.replace(u'.', '') for x in names]
    # assign to columns
    df.columns = names

    # drop rows where all values are NaN
    df = df[pd.notnull(df).any(axis=1)]

    # save
    if save_as is not None:
        df.to_pickle(save_as)

    return df
def test_blocks_compat_GH9037(self):
    index = pd.date_range('20000101', periods=10, freq='H')

    df_mixed = DataFrame(OrderedDict(
        float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
                 -0.60316077, 0.24653374, 0.28668979, -2.51969012,
                 0.95748401, -1.02970536],
        int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
               40314334, 21290235, 4991321, 41903419, 16008365],
        str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
               'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
        float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685,
                 -0.48217572, 0.86229683, 1.08935819, 0.93898739,
                 -0.03030452, 1.43366348],
        str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
               '08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
        int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
               34193846, 10561746, 24867120, 76131025]
    ), index=index)

    # JSON deserialisation always creates unicode strings
    df_mixed.columns = df_mixed.columns.astype('unicode')

    df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
                                orient='split')
    assert_frame_equal(df_mixed, df_roundtrip,
                       check_index_type=True,
                       check_column_type=True,
                       check_frame_type=True,
                       by_blocks=True,
                       check_exact=True)
def test_include_na(self, sparse, dtype):
    if sparse:
        pytest.xfail(reason='nan in index is problematic (GH 16894)')

    s = ['a', 'b', np.nan]
    res = get_dummies(s, sparse=sparse, dtype=dtype)
    exp = DataFrame({'a': [1, 0, 0],
                     'b': [0, 1, 0]},
                    dtype=self.effective_dtype(dtype))
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
    exp_na = DataFrame({nan: [0, 0, 1],
                        'a': [1, 0, 0],
                        'b': [0, 1, 0]},
                       dtype=self.effective_dtype(dtype))
    exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True,
                              sparse=sparse, dtype=dtype)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=self.effective_dtype(dtype))
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def prepare_dataset(filename):
    """
    Read a csv file and reshape it into a DataFrame that is easy to work with.
    Preprocessing is needed because the seconds-and-coarser part of the timestamp
    and the sub-second (ms) part are stored in separate columns.

    Argument
        filename -> str: path to the file
    Return
        Data -> pandas.DataFrame: DataFrame holding the datetime and the temperature
    """
    # load dataset
    data = pd.read_csv(filename, skiprows=17, encoding="shift-jis")
    data.columns = (["No", "Time", "ms", "Temp", "1", "A12345678", "A1234", "A1"])

    # reshape dataset
    index = np.arange(0, len(data))

    # data["Time"] (yy/MM/DD hh:mm:ss) -> hh:mm:ss
    # hh:mm:ss + ms -> datetime
    date = [str(data["Time"][i]).split(" ") for i in index]
    date = [date[i][1] + str(":") + str(data["ms"][i] * 10**3) for i in index]
    date = [date[i].split(":") for i in index]
    date = [datetime.time(int(date[i][0]), int(date[i][1]),
                          int(date[i][2]), int(date[i][3])) for i in index]

    # make dataset
    Data = DataFrame(np.c_[date, data["Temp"]])
    Data.columns = (["date", "temperature"])

    return Data
def test_at_to_fail(self):
    # at should not fallback
    # GH 7814
    s = Series([1, 2, 3], index=list('abc'))
    result = s.at['a']
    assert result == 1
    pytest.raises(ValueError, lambda: s.at[0])

    df = DataFrame({'A': [1, 2, 3]}, index=list('abc'))
    result = df.at['a', 'A']
    assert result == 1
    pytest.raises(ValueError, lambda: df.at['a', 0])

    s = Series([1, 2, 3], index=[3, 2, 1])
    result = s.at[1]
    assert result == 3
    pytest.raises(ValueError, lambda: s.at['a'])

    df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1])
    result = df.at[1, 0]
    assert result == 3
    pytest.raises(ValueError, lambda: df.at['a', 0])

    # GH 13822, incorrect error string with non-unique columns when missing
    # column is accessed
    df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]})
    df.columns = ['x', 'x', 'z']

    # Check that we get the correct value in the KeyError
    with pytest.raises(KeyError, match=r"\['y'\] not in index"):
        df[['x', 'y', 'z']]
def splitMNIST(data, random_state):

    print("\n####################")
    print("splitMNIST():\n")

    nrow = data["data"].shape[0]
    ncol = data["data"].shape[1]

    label_features = np.hstack(
        tup=(
            np.arange(nrow).reshape((nrow, 1)),
            data['target'].reshape((nrow, 1)),
            data['data']
        )
    )
    label_features = DataFrame(data=label_features)
    label_features.columns = ['index', 'label'] + getColnames(ncolSquared=ncol)

    simpleTrainSet, simpleTestSet = train_test_split(
        label_features,
        test_size=1 / 7,
        random_state=random_state
    )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    del label_features

    print("\nexiting: splitMNIST()")
    print("####################")
    return (simpleTrainSet, simpleTestSet)
def query_CAISODemand_hrly_Series():
    """specifically gets demand data"""
    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0, parentdir)
    import model
    s = model.connect()

    demand_obj = s.execute('SELECT time_start, mw_demand FROM "HistoricCAISODemands" WHERE caiso_tac=\'CA ISO-TAC\' and time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' ')
    demand_entry = demand_obj.fetchall()
    demand_df = DataFrame(demand_entry)
    demand_df.columns = ['time_start', 'mw_demand']

    dict_with_datetime_keys = {}
    for idx, row in enumerate(demand_df.values):
        time_start = row[0]
        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start) != True:
            # turn dict into a series. will auto-index on dict keys
            mw_demand = row[1]
            dict_with_datetime_keys[time_start] = mw_demand

    return Series(dict_with_datetime_keys)
def query_CAISONetImports_hrly_Series():
    """specifically gets import data"""
    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0, parentdir)
    import model
    s = model.connect()

    imports_obj = s.execute('SELECT time_start, sum(mw_imports) FROM "HistoricCAISONetImports" where time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' GROUP BY time_start ')
    imports_entry = imports_obj.fetchall()
    imports_df = DataFrame(imports_entry)
    imports_df.columns = ['time_start', 'mw_demand']

    dict_with_datetime_keys = {}
    for idx, row in enumerate(imports_df.values):
        time_start = row[0]
        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start) != True:
            # turn dict into a series. will auto-index on dict keys
            mw_imports = row[1]
            dict_with_datetime_keys[time_start] = mw_imports

    return Series(dict_with_datetime_keys)
def index_models_minho(host="http://darwin.di.uminho.pt/models"):
    """
    Retrieves a summary of all models in the database.

    Parameters
    ----------
    host: the service host (optional, default: http://darwin.di.uminho.pt/models)

    Returns
    -------
    pandas.DataFrame
        summary of the models in the database
    """
    uri = host + "/models.json"
    try:
        response = requests.get(uri)
    except requests.ConnectionError as e:
        logger.error("Cannot reach %s. Are you sure that you are connected to the internet?" % host)
        raise e

    if response.ok:
        try:
            json = response.json()
        except Exception as e:
            logger.error('No json could be decoded from server response coming from {}.'.format(host))
            raise e

        index = DataFrame(json, columns=["id", "name", "doi", "author", "year", "formats",
                                         "organism", "taxonomy", "optflux_validated"])
        index.columns = ["id", "name", "doi", "author", "year", "formats",
                         "organism", "taxonomy", "validated"]
        return index
    else:
        raise Exception("Could not index available models. %s returned status code %d"
                        % (host, response.status_code))
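# Hypothetical usage sketch (requires network access to the default host above);
# the column subset printed here is chosen purely for illustration.
if __name__ == "__main__":
    models = index_models_minho()
    print(models[["id", "name", "organism", "year"]].head())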
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].ix[:, 0:5]

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)

    ATL = DataFrame(columns=cols)
    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].ix[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].ix[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            oppenent_temp = DataFrame(aggs[Opponent])
            oppenent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, oppenent_temp], axis=1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis=0)
        print team

    ATL.to_csv('final.csv', sep=',', index=False)
def test_read_excel_multiindex(self, read_ext):
    # see gh-4679
    if pd.read_excel.keywords["engine"] == "pyxlsb":
        pytest.xfail("Sheets containing datetimes not supported by pyxlsb")

    mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
    mi_file = "testmultiindex" + read_ext

    # "mi_column" sheet
    expected = DataFrame(
        [
            [1, 2.5, pd.Timestamp("2015-01-01"), True],
            [2, 3.5, pd.Timestamp("2015-01-02"), False],
            [3, 4.5, pd.Timestamp("2015-01-03"), False],
            [4, 5.5, pd.Timestamp("2015-01-04"), True],
        ],
        columns=mi,
    )

    actual = pd.read_excel(mi_file, "mi_column", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # "mi_index" sheet
    expected.index = mi
    expected.columns = ["a", "b", "c", "d"]

    actual = pd.read_excel(mi_file, "mi_index", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "both" sheet
    expected.columns = mi

    actual = pd.read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "mi_index_name" sheet
    expected.columns = ["a", "b", "c", "d"]
    expected.index = mi.set_names(["ilvl1", "ilvl2"])

    actual = pd.read_excel(mi_file, "mi_index_name", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "mi_column_name" sheet
    expected.index = list(range(4))
    expected.columns = mi.set_names(["c1", "c2"])

    actual = pd.read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # see gh-11317
    # "name_with_int" sheet
    expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])

    actual = pd.read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_name" sheet
    expected.columns = mi.set_names(["c1", "c2"])
    expected.index = mi.set_names(["ilvl1", "ilvl2"])

    actual = pd.read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_skiprows" sheet
    actual = pd.read_excel(
        mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2
    )
    tm.assert_frame_equal(actual, expected)
def _set(df: pd.DataFrame, area: Optional[str] = None,
         currency: Optional[str] = None, inf_adj: Optional[str] = None,
         unit: Optional[str] = None, seas_adj: Optional[str] = None,
         ts_type: Optional[str] = None, cumperiods: Optional[int] = None):
    """Add a multiindex to a dataframe's columns.

    Characterize a dataframe by adding metadata to its column names by
    use of multiindexes.

    Parameters
    ----------
    df : Pandas dataframe
    area : str or None (default is None)
        Topic to which the data relates.
    currency : str or None (default is None)
        Currency denomination.
    inf_adj : str or None (default is None)
        Whether the data is in constant prices.
    unit : str or None (default is None)
        Units in which data is defined.
    seas_adj : str or None (default is None)
        Whether the data is seasonally adjusted.
    ts_type : str or None (default is None)
        Time series type, generally 'Stock' or 'Flujo'.
    cumperiods : int or None (default is None)
        Number of periods accumulated per period.

    Returns
    -------
    None

    Notes
    -----
    Modifies the dataframe's column names in place.

    """
    colnames = df.columns
    try:
        inferred_freq = pd.infer_freq(df.index)
    except ValueError:
        warnings.warn(
            "ValueError: Need at least 3 dates to infer frequency. "
            "Setting to '-'.", UserWarning)
        inferred_freq = "-"
    if inferred_freq is None:
        warnings.warn(
            "Metadata: frequency could not be inferred "
            "from the index. Setting to '-'.", UserWarning)
        inferred_freq = "-"

    names = [
        "Indicador", "Área", "Frecuencia", "Moneda", "Inf. adj.", "Unidad",
        "Seas. Adj.", "Tipo", "Acum. períodos"
    ]
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_product([
            colnames, [area], [inferred_freq], [currency], [inf_adj], [unit],
            [seas_adj], [ts_type], [cumperiods]
        ], names=names)
    else:
        arrays = []
        for level in range(0, 9):
            arrays.append(list(df.columns.get_level_values(level)))

        arrays[2] = [inferred_freq] * len(df.columns)
        if area is not None:
            arrays[1] = [area] * len(df.columns)
        if currency is not None:
            arrays[3] = [currency] * len(df.columns)
        if inf_adj is not None:
            arrays[4] = [inf_adj] * len(df.columns)
        if unit is not None:
            arrays[5] = [unit] * len(df.columns)
        if seas_adj is not None:
            arrays[6] = [seas_adj] * len(df.columns)
        if ts_type is not None:
            arrays[7] = [ts_type] * len(df.columns)
        if cumperiods is not None:
            arrays[8] = [cumperiods] * len(df.columns)

        try:
            arrays[8] = list(map(int, arrays[8]))
        except ValueError:
            pass

        tuples = list(zip(*arrays))
        df.columns = pd.MultiIndex.from_tuples(tuples, names=names)

    return
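# Minimal usage sketch for the helper above. The metadata values shown here are
# made up for illustration and are not taken from the original project; _set
# mutates df.columns into the nine-level MultiIndex described in the docstring.
import pandas as pd

demo = pd.DataFrame({"PIB": [100.0, 102.0, 104.0]},
                    index=pd.date_range("2020-03-31", periods=3, freq="Q"))
_set(demo, area="Actividad económica", currency="UYU", inf_adj="Const.",
     unit="Millones", seas_adj="NSA", ts_type="Flujo", cumperiods=1)
print(demo.columns.names)   # ['Indicador', 'Área', 'Frecuencia', ...]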
print(cities)
print(group)

n = 2
data = [[] for x in range(n)]
z1 = cities.get_group(1)
count = 0
for i in range(0, 20):
    count = count + 1
    z3 = z1[z1['Zones_id'] == count]
    a = len(z3.index)
    data[0].append(count)
    data[1].append(a)

df = DataFrame(data).transpose()
df.columns = ['Zone_id', 'Average_Calorie']
df.to_csv('zonesorders1.csv', index=False)

# city2
n = 2
data = [[] for x in range(n)]
z1 = cities.get_group(2)
count = 0
for i in range(0, 20):
    count = count + 1
    z3 = z1[z1['Zones_id'] == count]
    a = len(z3.index)
    data[0].append(count)
    data[1].append(a)

df = DataFrame(data).transpose()
def get_variance_accumulated(path_dataset, range_=(1, 11)):
    """Variance computed in parallel.

    Accumulates the variance across channels and files. Receives the path to
    the folder that contains the dataset files. By default it computes the
    variance only for the first ten patients. On the machine used for testing
    it took about 10 minutes to go through all the files and accumulate the
    variance. Warnings are filtered.
    """
    fold_variance = Path(path_dataset) / "variance_accumulated"

    if not check_exist(path_dataset, "variance_accumulated"):
        print("Loading the files to calculate variance.")
        filterwarnings("ignore")
        accumulate_count = 0
        accumulate_avg = 0
        accumulate_var = 0
        selected_channels = [
            "time", "FP1-F7", "F7-T7", "T7-P7", "P7-O1", "FP1-F3", "F3-C3",
            "C3-P3", "P3-O1", "FP2-F4", "F4-C4", "C4-P4", "P4-O2", "FP2-F8",
            "F8-T8", "T8-P8-0", "P8-O2", "FZ-CZ", "CZ-PZ", "P7-T7", "T7-FT9",
            "FT9-FT10", "FT10-T8", "T8-P8-1",
        ]
        for id_patient in tqdm_notebook(range(range_[0], range_[1]),
                                        desc="Patient"):
            path_files = join(path_dataset, "chb{0:0=2d}/*.edf".format(id_patient))
            files_in_folder = glob(path_files)
            for enum, file in enumerate(
                    tqdm_notebook(files_in_folder, desc="Files", leave=False)):
                variance_file = read_raw_edf(input_fname=file,
                                             verbose=0).to_data_frame(
                                                 picks=["eeg"],
                                                 time_format="ms")
                # Removing channels that are not present in all files.
                variance_file = variance_file[
                    variance_file.columns.intersection(selected_channels)]
                # Sorting the channels
                variance_file.sort_index(axis=1, inplace=True)
                if (enum == 0) & (id_patient == 0):
                    accumulate_count = len(variance_file)
                    accumulate_avg = variance_file.mean()
                    accumulate_var = variance_file.var()
                else:
                    (
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                    ) = parallel_variance(
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                        len(variance_file),
                        variance_file.mean(),
                        variance_file.var(),
                    )

        accumulate_var = DataFrame(accumulate_var)
        accumulate_var.columns = accumulate_var.columns.astype(str)
        accumulate_var.to_parquet(fold_variance / "variance_accumulated.parquet",
                                  engine="pyarrow")
        return accumulate_var
    else:
        print("Reading the variance already calculated.")
        variance = read_parquet(fold_variance / "variance_accumulated.parquet",
                                engine="pyarrow")
        return variance
def test_boolean_comparison(self):

    # GH 4576
    # boolean comparisons with a tuple/list give unexpected results
    df = DataFrame(np.arange(6).reshape((3, 2)))
    b = np.array([2, 2])
    b_r = np.atleast_2d([2, 2])
    b_c = b_r.T
    l = (2, 2, 2)
    tup = tuple(l)

    # gt
    expected = DataFrame([[False, False], [False, True], [True, True]])
    result = df > b
    assert_frame_equal(result, expected)

    result = df.values > b
    assert_numpy_array_equal(result, expected.values)

    result = df > l
    assert_frame_equal(result, expected)

    result = df > tup
    assert_frame_equal(result, expected)

    result = df > b_r
    assert_frame_equal(result, expected)

    result = df.values > b_r
    assert_numpy_array_equal(result, expected.values)

    pytest.raises(ValueError, df.__gt__, b_c)
    pytest.raises(ValueError, df.values.__gt__, b_c)

    # ==
    expected = DataFrame([[False, False], [True, False], [False, False]])
    result = df == b
    assert_frame_equal(result, expected)

    result = df == l
    assert_frame_equal(result, expected)

    result = df == tup
    assert_frame_equal(result, expected)

    result = df == b_r
    assert_frame_equal(result, expected)

    result = df.values == b_r
    assert_numpy_array_equal(result, expected.values)

    pytest.raises(ValueError, lambda: df == b_c)
    assert df.values.shape != b_c.shape

    # with alignment
    df = DataFrame(np.arange(6).reshape((3, 2)),
                   columns=list('AB'), index=list('abc'))
    expected.index = df.index
    expected.columns = df.columns

    result = df == l
    assert_frame_equal(result, expected)

    result = df == tup
    assert_frame_equal(result, expected)
    pickle.dump(content_WS_list, fp)
    fp.close()

with open('reference_WS_list.pkl', 'wb') as fp:
    pickle.dump(reference_WS_list, fp)
    fp.close()

all_list = []
all_list.append(chi_paper_name_WS_list)
all_list.append(chi_keyword_WS_list)
all_list.append(abstract_WS_list)
all_list.append(content_WS_list)
all_list.append(reference_WS_list)
print(len(all_list))

df = DataFrame(all_list).transpose()
df.columns = ["chi_paper_name_WS", "chi_keyword_WS", "abstract_WS",
              "content_WS", "reference_WS"]
print(df.info())
print(df.head(5))
df.to_csv("all_data_WS.csv")

with open('All_WS_list.pkl', 'wb') as fp:
    pickle.dump(all_list, fp)
    fp.close()

'''
def add_field_to_mongodb(collection_name, chi_paper_name_WS_list, chi_keyword_WS_list,
                         abstract_WS_list, content_WS_list, reference_WS_list):
    client = MongoClient('localhost', 27017)
    db = client['110_conference']
    collection = db[collection_name]

    AllFields_array = np.array(All_list)
    # print(AllFields_list[0][2], AllFields_list[1][2])
# returns one row, or otherwise None, as a tuple
print(c.fetchone())

# returns a list of tuples
print(c.fetchmany(2))
print(c.fetchall())

# Since the cursor has now read all the rows and we are at the end,
# fetch the records from the database again
c.execute("SELECT * FROM employees")

# STEP 5
# put the result into a DataFrame
df = DataFrame(c.fetchall())
df.columns = ["id", "first", "last", "pay"]

# STEP 6
# commits the current transaction
conn.commit()

# STEP 7
# closing the connection
conn.close()

"""
Database handling using MySQL on Local Machine
"""

# use the command below in an Anaconda prompt:
# pip install mysql-connector-python

from pandas import DataFrame
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('gymnasium-names2.csv')

# pd.concat([Series(row['gymnasium'], row['fach'].split(','))
#            for _, row in df.iterrows()]).reset_index()

b = DataFrame(df.fach.str.split(', ').tolist(), index=df.gymnasium).stack()
b = b.reset_index()[[0, 'gymnasium']]  # var1 variable is currently labeled 0
b.columns = ['fach', 'gymnasium']  # renaming var1
b.to_csv('dim_fach.csv', index=False)

b = DataFrame(df.sprachen.str.split(', ').tolist(), index=df.gymnasium).stack()
b = b.reset_index()[[0, 'gymnasium']]  # var1 variable is currently labeled 0
b.columns = ['sprachen', 'gymnasium']  # renaming var1
b.to_csv('dim_sprachen.csv', index=False)
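# Self-contained sketch of the split/stack idiom used above, with made-up data so
# it runs without gymnasium-names2.csv; the values are illustrative only.
import pandas as pd
from pandas import DataFrame

demo = pd.DataFrame({'gymnasium': ['A', 'B'],
                     'fach': ['Mathe, Physik', 'Kunst']})
long_df = DataFrame(demo.fach.str.split(', ').tolist(),
                    index=demo.gymnasium).stack()
long_df = long_df.reset_index()[[0, 'gymnasium']]
long_df.columns = ['fach', 'gymnasium']
print(long_df)   # one row per (fach, gymnasium) pair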
from flask_sqlalchemy import SQLAlchemy
from datetime import datetime
import time
from urllib.parse import unquote

app = Flask(__name__)
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///mydb.db'
db = SQLAlchemy(app)
db.create_all()

# Load the data from the db

# books df
books_db = db.session.execute('select * from Books LIMIT 1000')
books = DataFrame(books_db.fetchall())
books.columns = books_db.keys()
# print(books.head())
# print(books.shape)

# ratings df
ratings_db = db.session.execute('select * from ratings LIMIT 1000')
ratings = DataFrame(ratings_db.fetchall())
ratings.columns = ratings_db.keys()
# print(ratings.head())
# print(ratings.shape)

# book_tags df
book_tags_db = db.session.execute('select * from book_tags LIMIT 1000')
book_tags = DataFrame(book_tags_db.fetchall())
book_tags.columns = book_tags_db.keys()
# print(book_tags.head())
def test_to_frame():
    tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]

    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]
    index = MultiIndex.from_tuples(tuples, names=["first", "second"])
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False, name=["first", "second"])
    expected = DataFrame(tuples)
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=["first", "second"])
    expected.index = index
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    msg = "'name' must be a list / sequence of column names."
    with pytest.raises(TypeError, match=msg):
        index.to_frame(name="first")

    msg = "'name' should have same length as number of levels on index."
    with pytest.raises(ValueError, match=msg):
        index.to_frame(name=["first"])

    # Tests for datetime index
    index = MultiIndex.from_product(
        [range(5), pd.date_range("20130101", periods=3)])
    result = index.to_frame(index=False)
    expected = DataFrame({
        0: np.repeat(np.arange(5, dtype="int64"), 3),
        1: np.tile(pd.date_range("20130101", periods=3), 5),
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    result = index.to_frame(index=False, name=["first", "second"])
    expected = DataFrame({
        "first": np.repeat(np.arange(5, dtype="int64"), 3),
        "second": np.tile(pd.date_range("20130101", periods=3), 5),
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=["first", "second"])
    expected.index = index
    tm.assert_frame_equal(result, expected)
def create_data():
    """create the pickle data"""
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = {
        "timestamp": Timestamp("20130101"),
        "period": Period("2012", "M")
    }

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30T", periods=10),
    }

    index["range"] = RangeIndex(10)
    index["interval"] = interval_range(0, periods=10)

    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(
                zip(*[
                    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                    ["one", "two", "one", "two", "one", "two", "one", "two"],
                ])),
            names=["first", "second"],
        )
    }

    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(np.arange(10).astype(np.int64),
                     index=date_range("20130101", periods=10)),
        "mi": Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(tuple(
                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                names=["one", "two"]),
        ),
        "dup": Series(np.arange(5).astype(np.float64),
                      index=["A", "B", "C", "D", "A"]),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = {
        "float": DataFrame({
            "A": series["float"],
            "B": series["float"] + 1
        }),
        "int": DataFrame({
            "A": series["int"],
            "B": series["int"] + 1
        }),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {
                "A": np.arange(5).astype(np.float64),
                "B": np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(
                tuple(
                    zip(*[
                        ["bar", "bar", "baz", "baz", "baz"],
                        ["one", "two", "one", "two", "three"],
                    ])),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                         columns=["A", "B", "A"]),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame({
            "A": Categorical(["foo", "bar", "baz"]),
            "B": np.arange(3).astype(np.int64),
        }),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {
            "float": _create_sp_series(),
            "ts": _create_sp_tsseries()
        },
        "sp_frame": {
            "float": _create_sp_frame()
        },
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
import pandas as pd
from pandas import DataFrame

csvfile = pd.read_csv('./cloth_shop.csv', header=None)
df = DataFrame(csvfile)
df.columns = [
    'Name', 'Age', 'Weight', 'm0006', 'm0612', 'm1218', 'f0006', 'f0612',
    'f1218'
]

# drop rows that are completely empty
df.dropna(inplace=True, how='all')

# fill missing ages with the most frequent value
age_maxf = df['Age'].value_counts().index[0]
df['Age'].fillna(age_maxf, inplace=True)

# split the Name column into first and last name
df[['First_Name', 'Last_Name']] = df['Name'].str.split(expand=True)
# df.insert(0, ['First_Name', 'Last_Name'], df['Name'].str.split(expand=True))
df.drop('Name', axis=1, inplace=True)

# remove non-ASCII characters (assign back; inplace on a column slice has no effect)
df[['First_Name', 'Last_Name']] = df[['First_Name', 'Last_Name']].replace(
    {r'[^\x00-\x7F]+': ''}, regex=True)

df.drop_duplicates(['First_Name', 'Last_Name'], inplace=True)

# get the rows in the Weight column whose unit is lbs
rows_with_lbs = df['Weight'].str.contains('lbs').fillna(False)
# -*- coding: utf-8 -*-
"""
Created on Thu May 16 16:03:57 2019

@author: abhin
"""

import sqlite3
from pandas import DataFrame

coonect1 = sqlite3.connect("univercity.db")
curser1 = coonect1.cursor()

curser1.execute("""CREATE TABLE univercity(
                Student_Name TEXT,
                Student_Age INTEGER,
                Student_Roll_no INTEGER,
                Student_Branch TEXT
                )""")

curser1.execute("INSERT INTO univercity VALUES ('abhi',20,1,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi2',21,2,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi3',23,3,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi4',25,1,'it')")

curser1.execute("SELECT * FROM univercity")

df = DataFrame(curser1.fetchall())
df.columns = [
    "Student_Name", "Student_Age", "Student_Roll_no", "Student_Branch"
]

coonect1.commit()
coonect1.close()
def test_info_memory_usage(self):
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " in res[-1])

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " not in res[-1])

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    # memory usage is a lower bound, so print it as XYZ+ MB
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    # excluded column with object dtype, so estimate is accurate
    self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index.info(buf=buf, memory_usage='deep')
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

    self.assertTrue(df_with_object_index.memory_usage(index=True,
                                                      deep=True).sum() >
                    df_with_object_index.memory_usage(index=True).sum())

    df_object = pd.DataFrame({'a': ['a']})
    self.assertTrue(df_object.memory_usage(deep=True).sum() >
                    df_object.memory_usage().sum())

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    # Ensure df size is as expected
    df_size = df.memory_usage().sum()
    exp_size = (len(dtypes) + 1) * n * 8  # (cols + index) * rows * bytes
    self.assertEqual(df_size, exp_size)

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    self.assertEqual(size_df, np.size(df.memory_usage()))

    # assert deep works only on object
    self.assertEqual(df.memory_usage().sum(),
                     df.memory_usage(deep=True).sum())

    # test for validity
    DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
    DataFrame(1, index=['a'], columns=['A']).index.nbytes

    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product([['a'], range(1000)]),
        columns=['A']
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
    self.assertTrue(abs(diff) < 100)
def create_data():
    """ create the pickle data """
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M"))

    index = dict(
        int=Index(np.arange(10)),
        date=date_range("20130101", periods=10),
        period=period_range("2013-01-01", freq="M", periods=10),
        float=Index(np.arange(10, dtype=np.float64)),
        uint=Index(np.arange(10, dtype=np.uint64)),
        timedelta=timedelta_range("00:00:00", freq="30T", periods=10),
    )

    index["range"] = RangeIndex(10)

    if _loose_version >= LooseVersion("0.21"):
        from pandas import interval_range

        index["interval"] = interval_range(0, periods=10)

    mi = dict(
        reg2=MultiIndex.from_tuples(
            tuple(
                zip(
                    *[
                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                        ["one", "two", "one", "two", "one", "two", "one", "two"],
                    ]
                )
            ),
            names=["first", "second"],
        )
    )

    series = dict(
        float=Series(data["A"]),
        int=Series(data["B"]),
        mixed=Series(data["E"]),
        ts=Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        mi=Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
            ),
        ),
        dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        cat=Series(Categorical(["foo", "bar", "baz"])),
        dt=Series(date_range("20130101", periods=5)),
        dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")),
        period=Series([Period("2000Q1")] * 5),
    )

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = dict(
        float=DataFrame({"A": series["float"], "B": series["float"] + 1}),
        int=DataFrame({"A": series["int"], "B": series["int"] + 1}),
        mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        mi=DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        *[
                            ["bar", "bar", "baz", "baz", "baz"],
                            ["one", "two", "one", "two", "three"],
                        ]
                    )
                ),
                names=["first", "second"],
            ),
        ),
        dup=DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}),
        cat_and_float=DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        dt_mixed2_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    )

    cat = dict(
        int8=Categorical(list("abcdefg")),
        int16=Categorical(np.arange(1000)),
        int32=Categorical(np.arange(10000)),
    )

    timestamp = dict(
        normal=Timestamp("2011-01-01"),
        nat=NaT,
        tz=Timestamp("2011-01-01", tz="US/Eastern"),
    )

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return dict(
        series=series,
        frame=frame,
        index=index,
        scalars=scalars,
        mi=mi,
        sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()),
        sp_frame=dict(float=_create_sp_frame()),
        cat=cat,
        timestamp=timestamp,
        offsets=off,
    )
def _read_one_data(self, url, params): """ read one data from specified symbol """ symbol = params['symbol'] del params['symbol'] url = url.format(symbol) resp = self._get_response(url, params=params) ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);' try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) data = j['context']['dispatcher']['stores']['HistoricalPriceStore'] except KeyError: msg = 'No data fetched for symbol {} using {}' raise RemoteDataError(msg.format(symbol, self.__class__.__name__)) # price data prices = DataFrame(data['prices']) prices.columns = [col.capitalize() for col in prices.columns] prices['Date'] = to_datetime( to_datetime(prices['Date'], unit='s').dt.date) if 'Data' in prices.columns: prices = prices[prices['Data'].isnull()] prices = prices[[ 'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose' ]] prices = prices.rename(columns={'Adjclose': 'Adj Close'}) prices = prices.set_index('Date') prices = prices.sort_index().dropna(how='all') if self.ret_index: prices['Ret_Index'] = \ _calc_return_index(prices['Adj Close']) if self.adjust_price: prices = _adjust_prices(prices) # dividends & splits data if self.get_actions and data['eventsData']: actions = DataFrame(data['eventsData']) actions.columns = [col.capitalize() for col in actions.columns] actions['Date'] = to_datetime( to_datetime(actions['Date'], unit='s').dt.date) types = actions['Type'].unique() if 'DIVIDEND' in types: divs = actions[actions.Type == 'DIVIDEND'].copy() divs = divs[['Date', 'Amount']].reset_index(drop=True) divs = divs.set_index('Date') divs = divs.rename(columns={'Amount': 'Dividends'}) prices = prices.join(divs, how='outer') if 'SPLIT' in types: splits = actions[actions.Type == 'SPLIT'].copy() splits['SplitRatio'] = splits.apply( lambda row: eval(row['Splitratio']) if float(row['Numerator']) > 0 else 1, axis=1) splits = splits.reset_index(drop=True) splits = splits.set_index('Date') splits['Splits'] = splits['SplitRatio'] prices = prices.join(splits['Splits'], how='outer') if 'DIVIDEND' in types and self.adjust_dividends: # Adjust dividends to deal with splits adj = prices['Splits'].sort_index( ascending=False).fillna(1).cumprod() prices['Dividends'] = prices['Dividends'] * adj return prices
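# Illustrative aside (made-up values, not taken from the reader above): the nested
# to_datetime calls above turn Yahoo's epoch-second timestamps into naive midnight
# dates; a standalone sketch of the same conversion.
import pandas as pd

epoch_seconds = pd.Series([1609459200, 1609545600])  # 2021-01-01, 2021-01-02 UTC
dates = pd.to_datetime(pd.to_datetime(epoch_seconds, unit='s').dt.date)
# .dt.normalize() is an equivalent one-step spelling
dates_alt = pd.to_datetime(epoch_seconds, unit='s').dt.normalize()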
def test_info_memory_usage(self): # Ensure memory usage is displayed, when asserted, on the last line dtypes = [ 'int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', 'complex128', 'object', 'bool' ] data = {} n = 10 for i, dtype in enumerate(dtypes): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() # display memory usage case df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert "memory usage: " in res[-1] # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() assert "memory usage: " not in res[-1] df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() # memory usage is a lower bound, so print it as XYZ+ MB assert re.match(r"memory usage: [^+]+\+", res[-1]) df.iloc[:, :5].info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() # excluded column with object dtype, so estimate is accurate assert not re.match(r"memory usage: [^+]+\+", res[-1]) # Test a DataFrame with duplicate columns dtypes = ['int64', 'int64', 'int64', 'float64'] data = {} n = 100 for i, dtype in enumerate(dtypes): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) df.columns = dtypes df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) df_with_object_index.info(buf=buf, memory_usage='deep') res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+$", res[-1]) # Ensure df size is as expected # (cols * rows * bytes) + index size df_size = df.memory_usage().sum() exp_size = len(dtypes) * n * 8 + df.index.nbytes assert df_size == exp_size # Ensure number of cols in memory_usage is the same as df size_df = np.size(df.columns.values) + 1 # index=True; default assert size_df == np.size(df.memory_usage()) # assert deep works only on object assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() # test for validity DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True) DataFrame(1, index=['a'], columns=['A']).index.nbytes df = DataFrame(data=1, index=pd.MultiIndex.from_product([['a'], range(1000)]), columns=['A']) df.index.nbytes df.memory_usage(index=True) df.index.values.nbytes mem = df.memory_usage(deep=True).sum() assert mem > 0
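# Small illustration (assumed values) of why the shallow estimate above is only a
# lower bound: object columns report pointer size unless deep=True is passed.
import pandas as pd

demo = pd.DataFrame({'strings': ['a' * 100] * 1000})
shallow = demo.memory_usage().sum()        # counts 8-byte references only
deep = demo.memory_usage(deep=True).sum()  # also counts the string payloads
assert deep > shallow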
def train_old_net(): learning_rate = 0.001 L1_reg = 0.00 L2_reg = 0.0001 n_epochs = 100 batch_size = 20 n_hidden = 500 datasets = load_data(path) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x = datasets[2] #compute number of minibatches n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size print '...building the model' index = T.lscalar() x = T.matrix('x') y = T.ivector('y') rng = np.random.RandomState(1234567890) #construct the MLP class #Attention!!! #this line to set p_drop_perceptron and p_drop_logistic #if set no dropout then decrease the early stop threshold #improvement_threshold on line 292 classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10, p_drop_perceptron=0, p_drop_logistic=0) # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = classifier.negative_log_likelihood( y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr #compiling a theano function that computes the mistake rate that #made by the validate_set on minibatch validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) #symbolicly compute the gradient of cost respect to params #the resulting gradient will be stored in list gparams gparams = [] for param in classifier.params: gparam = T.grad(cost, param) gparams.append(gparam) #using RMSprop(scaling the gradient based on running average) #to update the parameters of the model as a list of (variable,update expression) pairs def RMSprop(gparams, params, learning_rate, rho=0.9, epsilon=1e-6): """ param:rho,the fraction we keep the previous gradient contribution """ updates = [] for p, g in zip(params, gparams): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - rho) * g**2 gradient_scaling = T.sqrt(acc_new + epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - learning_rate * g)) return updates #compiling a Theano function 'train_model' that returns the cost #but in the same time updates the parameter of the model based on #the rules defined in 'updates' train_model = theano.function( inputs=[index], outputs=cost, updates=RMSprop(gparams, classifier.params, learning_rate=0.001), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############## #Train Model## ############## print '...training' #early-stopping parameters patience = 10000 #look as this many examples regardless patience_increase = 2 #wait the iter number longer when a new best is found #improvement_threshold=0.995 # a relative improvement of this much on validation set # considered as not overfitting # if have added drop-out noise,we can increase the value improvement_threshold = 0.995 validation_frequency = min(n_train_batches, patience / 2) # every this much interval check on the validation set # to see if the net is overfitting. 
# patience/2 because we want to at least check twice before getting the patience # include n_train_batches to ensure we at least check on every epoch best_validation_error_rate = np.inf epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) #iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: #validation validation_error_rate = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_error_rate = np.mean(validation_error_rate) print('epoch %i,validation error %f %%' % (epoch, this_validation_error_rate * 100.)) #if we got the best validation score until now if this_validation_error_rate < best_validation_error_rate: #improve the patience if error rate is good enough if this_validation_error_rate < best_validation_error_rate * improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_error_rate = this_validation_error_rate if patience <= iter: done_looping = True break ########################################### # Predict with trained parameters(nonoise)# ########################################### classifier.p_drop_perceptron = 0 classifier.p_drop_logistic = 0 y_x = classifier.predict() model_predict = theano.function( inputs=[index], outputs=y_x, givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]}) digit_preds = Series( np.concatenate([model_predict(i) for i in xrange(n_test_batches)])) image_ids = Series(np.arange(1, len(digit_preds) + 1)) submission = DataFrame([image_ids, digit_preds]).T submission.columns = ['ImageId', 'Label'] submission.to_csv(path + 'submission_sample.csv', index=False)
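# A plain-NumPy restatement of the RMSprop rule used above (a sketch, not the
# Theano graph): keep a running average of squared gradients and scale each step
# by its square root.
import numpy as np

def rmsprop_step(param, grad, acc, learning_rate=0.001, rho=0.9, epsilon=1e-6):
    acc_new = rho * acc + (1 - rho) * grad ** 2
    param_new = param - learning_rate * grad / np.sqrt(acc_new + epsilon)
    return param_new, acc_new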
def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result
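# Hedged usage example for dataframe_from_int_dict: the integer keys of `data` are
# treated as positions into frame_template.columns, so the helper relabels them
# with the template's column labels.  The values below are illustrative.
import numpy as np
import pandas as pd

template = pd.DataFrame(np.zeros((3, 3)), columns=['x', 'y', 'z'])
result = dataframe_from_int_dict({0: [1, 2, 3], 2: [4, 5, 6]}, template)
# result.columns -> Index(['x', 'z'], dtype='object')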
def get_variance_by_person(path_dataset, range_=(1, 11)): """Calculate the variance by person.""" fold_variance = Path(path_dataset) / "variance_person" if not check_exist(path_dataset, "variance_person"): print("Loading the files to calculate variance.") filterwarnings("ignore") var_pearson = [] selected_channels = [ "time", "FP1-F7", "F7-T7", "T7-P7", "P7-O1", "FP1-F3", "F3-C3", "C3-P3", "P3-O1", "FP2-F4", "F4-C4", "C4-P4", "P4-O2", "FP2-F8", "F8-T8", "T8-P8-0", "P8-O2", "FZ-CZ", "CZ-PZ", "P7-T7", "T7-FT9", "FT9-FT10", "FT10-T8", "T8-P8-1", ] for id_patient in tqdm_notebook(range(range_[0], range_[1]), desc="Patient"): accumulate_count = 0 accumulate_avg = 0 accumulate_var = 0 path_files = join(path_dataset, "chb{0:0=2d}/*.edf".format(id_patient)) files_in_folder = glob(path_files) for enum, file in enumerate( tqdm_notebook(files_in_folder, desc="Files", leave=False)): variance_file = read_raw_edf(input_fname=file, verbose=0).to_data_frame( picks=["eeg"], time_format="ms") # Removing channels that are not present in all files. variance_file = variance_file[ variance_file.columns.intersection(selected_channels)] # Sorting the channels variance_file.sort_index(axis=1, inplace=True) if enum == 0: accumulate_count = len(variance_file) accumulate_avg = variance_file.mean() accumulate_var = variance_file.var() else: ( accumulate_count, accumulate_avg, accumulate_var, ) = parallel_variance( accumulate_count, accumulate_avg, accumulate_var, len(variance_file), variance_file.mean(), variance_file.var(), ) var_pearson.append(accumulate_var) variance_df = DataFrame(var_pearson).drop("time", 1) variance = DataFrame([ file.sort_index().sort_values().index[-1] for ind, file in variance_df.iterrows() ]) variance.columns = variance.columns.astype(str) variance.to_parquet(fold_variance / "variance_person.parquet", engine="pyarrow") return variance else: print("Reading the variance already calculated.") variance = read_parquet(fold_variance / "variance_person.parquet", engine="pyarrow") return variance
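# parallel_variance() is called above but not shown; a common implementation
# (assumed here, following the pairwise update of Chan et al.) merges the count,
# mean and sample variance of two chunks without revisiting the raw samples.  It
# also works element-wise when the means/variances are per-channel pandas Series,
# as in get_variance_by_person.
def parallel_variance_sketch(n_a, mean_a, var_a, n_b, mean_b, var_b):
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # convert sample variances (ddof=1, the pandas default) to sums of squared
    # deviations, merge, then convert back
    m2 = var_a * (n_a - 1) + var_b * (n_b - 1) + delta ** 2 * n_a * n_b / n
    return n, mean, m2 / (n - 1)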
logreg.fit(X_train, Y_train) Y_pred = logreg.predict(X_test) logreg.score(X_train, Y_train) # In[ ]: random_forest = RandomForestClassifier(n_estimators=100) random_forest.fit(X_train, Y_train) Y_pred = random_forest.predict(X_test) random_forest.score(X_train, Y_train) # In[ ]: coeff_df = DataFrame(titanic_df.columns.delete(0)) coeff_df.columns = ['Features'] coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0]) # preview coeff_df
def extract(self): print('\nExtracting Glofas Data\n') files = [ f for f in listdir(self.inputPath) if isfile(join(self.inputPath, f)) and f.endswith('.nc') ] df_thresholds = DataFrame(self.GLOFAS_STATIONS) df_thresholds.columns = self.glofas_cols df_thresholds = df_thresholds.set_index("station_code", drop=False) df_district_mapping = DataFrame(self.DISTRICT_MAPPING) df_district_mapping.columns = self.district_cols df_district_mapping = df_district_mapping.set_index( "station_code_7day", drop=False) stations = [] trigger_per_day = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, } for i in range(0, len(files)): logging.info("Extracting glofas data from %s", i) Filename = os.path.join(self.inputPath, files[i]) station = {} station['code'] = files[i].split('_')[2] data = xr.open_dataset(Filename) # Get threshold for this specific station if station['code'] in df_thresholds['station_code'] and station[ 'code'] in df_district_mapping['station_code_7day']: print(Filename) threshold = df_thresholds[df_thresholds.station_code == station['code']]['trigger_level'][0] # Set dimension-values time = 0 for step in range(1, 8): # Loop through 51 ensembles, get forecast (for 3 or 7 day) and compare to threshold ensemble_options = 51 count = 0 dis_sum = 0 for ensemble in range(0, ensemble_options): discharge = data['dis'].sel(ensemble=ensemble, step=step).values[time][0] # DUMMY OVERWRITE DEPENDING ON COUNTRY SETTING if SETTINGS[ self.country_code]['dummy_trigger'] == True: if step < 5: discharge = 0 elif station[ 'code'] == 'G1361': # ZMB dummy flood station 1 discharge = 8000 elif station[ 'code'] == 'G1328': # ZMB dummy flood station 2 discharge = 9000 elif station[ 'code'] == 'G5200': # UGA dummy flood station discharge = 700 elif station[ 'code'] == 'G1067': # ETH dummy flood station discharge = 1000 elif station[ 'code'] == 'G1904': # ETH dummy flood station discharge = 2000 elif station[ 'code'] == 'G5194': # KEN dummy flood station discharge = 2000 else: discharge = 0 if discharge >= threshold: count = count + 1 dis_sum = dis_sum + discharge prob = count / ensemble_options dis_avg = dis_sum / ensemble_options station['fc_' + self.fcStep] = dis_avg station['fc_' + self.fcStep + '_prob'] = prob station['fc_' + self.fcStep + '_trigger'] = 1 if prob > TRIGGER_LEVELS[ 'minimum'] else 0 if station['fc_' + self.fcStep + '_trigger'] == 1: trigger_per_day[step] = 1 if step == self.days: stations.append(station) station = {} station['code'] = files[i].split('_')[2] data.close() # Add 'no_station' and all currently unavailable glofas-stations manually for now for station_code in [ 'no_station' ]: #,'F0043','F0044','F0045','F0046','F0047','F0048','F0049','F0050','F0051','F0052','F0053','F0054','F0055','F0056','G5696']: station = {} station['code'] = station_code station['fc_' + self.fcStep] = 0 station['fc_' + self.fcStep + '_prob'] = 0 station['fc_' + self.fcStep + '_trigger'] = 0 stations.append(station) with open(self.extractedGlofasPath, 'w') as fp: json.dump(stations, fp) print('Extracted Glofas data - File saved') with open(self.triggerPerDay, 'w') as fp: json.dump([trigger_per_day], fp) print('Extracted Glofas data - Trigger per day File saved')
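# Vectorised aside (a sketch assuming the same 'dis' variable and dimension names
# used above): the inner per-ensemble loop can be expressed as an xarray reduction
# that returns the exceedance probability and ensemble mean directly.
def exceedance_probability_sketch(data, step, threshold):
    """Fraction of ensemble members at `step` whose discharge reaches `threshold`."""
    discharge = data['dis'].sel(step=step)
    return (discharge >= threshold).mean(dim='ensemble'), discharge.mean(dim='ensemble')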
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( { 'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd') }, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( { 'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c'] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( { 'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame( { 'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan] }, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when values are # missing df = DataFrame(np.random.randn(4, 3), index=list('ABCD')) expected = df.reindex(['E']) dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with catch_warnings(record=True): result = dfnu.ix[['E']] tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame({'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected) # GH 5835 # dups on index and missing values df = DataFrame(np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) expected = pd.concat([ df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], index=df.index) ], axis=1) result = df.loc[:, ['A', 'B', 'C']] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing df = DataFrame(np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ['a', 'b']] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ['a', 'b']] tm.assert_frame_equal(result, expected)
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( {'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): dfnu.loc[['E']] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected)
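# Context note (informational, not part of the test): the FutureWarnings asserted
# above come from .loc lookups that include labels missing from the index;
# .reindex is the forward-compatible spelling of that operation.
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [0, 1, 2]})
result = df.reindex([0, 8, 0])  # the missing label 8 becomes a NaN row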
position1_weight=1,position2_weight=0.9, position3_weight=0.8, context_word_count_weight=1,topic_weight=1) for topic_index in range(0,total_topics_num):#Save the topic terms and their context keywords of this cluster in the dataframe topic_keywords_num=len(topics_keywords_context_weights[topic_index]) #If the number of topic terms in one theme is less than the displaying number of topic terms which is setted before, then make it up with "None". if topic_keywords_num<topic_feature_display_num: for add_index in range(0,topic_feature_display_num-topic_keywords_num): topics_keywords_context_weights[topic_index].append('None') nmf_topic_detail_list[topic_index].append(('None',0)) cluster_topic_df[str(cluster_index+1)+str(topic_index)]=\ [word for (word,weight) in nmf_topic_detail_list[topic_index]] cluster_topic_with_context[str(cluster_index+1)+str(topic_index)]=\ topics_keywords_context_weights[topic_index] #Generate the two layer column names of dataframe clusters_name_columns=[] topics_name_columns=[] for cluster_index in range(1,clusters_num+1): for topic_index in range(1,total_topics_num+1): clusters_name_columns.append(str(cluster_index)) topics_name_columns.append(chr(64+topic_index)) cluster_topic_with_context.columns=[clusters_name_columns,topics_name_columns] cluster_topic_with_context.columns.names=['clusters','topics'] cluster_topic_df.columns=[clusters_name_columns,topics_name_columns] cluster_topic_df.columns.names=['clusters','topics'] cluster_topic_with_context.to_csv('topics_context_'+file_name+'_'+ topic_model_name+'.csv') cluster_topic_df.to_csv(file_name+'_'+topic_model_name+'.csv')
import sqlalchemy
from pandas import DataFrame

engine = sqlalchemy.create_engine("mysql://root:@localhost/data science")
query = "select * from job_satisfaction"  # query to run against the database
resoverall = engine.execute(query)  # execute the query
df = DataFrame(resoverall.fetchall())  # put the result into a DataFrame
df.columns = resoverall.keys()  # set the column names as they are in the database

from pandas import DataFrame
import mysql.connector

# connect to the MySQL server along with the database name
conn = mysql.connector.connect(user='******', password='', host='localhost', database='job_satisfaction')
# create a cursor object from the connection object
cursor = conn.cursor()
query = "SELECT * FROM job_satisfaction;"  # query to run against the database
cursor.execute(query)  # execute the query
df = DataFrame(cursor.fetchall())  # put the result into a DataFrame
df.columns = cursor.column_names  # set the column names as they are in the database
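# Equivalent shortcut (an aside, reusing the SQLAlchemy engine created above):
# pandas can execute the query and pick up the column names in a single call.
import pandas as pd

df = pd.read_sql("select * from job_satisfaction", engine)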
import arff
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame
import pickle

data = arff.load(open('./OffComBR3.arff'))
df = DataFrame(data['data'])
df.columns = ['hate', 'sentence']
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()

cl = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 4))),
               ('clf', RandomForestClassifier(n_estimators=100, max_depth=None,
                                              min_samples_leaf=1, min_samples_split=2,
                                              min_weight_fraction_leaf=0))])
cl.fit(X, y)

cl_filename = 'randomforest.sav'
df_filename = 'data.sav'

# persist the fitted pipeline
with open(cl_filename, 'wb') as f:
    pickle.dump(cl, f)
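# Hedged follow-up: reloading the pickled pipeline and scoring new text.  The
# example sentence is made up; the file name matches cl_filename above.
import pickle

with open('randomforest.sav', 'rb') as f:
    loaded_cl = pickle.load(f)
prediction = loaded_cl.predict(['example sentence to classify'])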
def pseudobulk(adata, outpath = None, column = 'celltype0', label = 'celltype0', split_condition = 'donor', todrop =['CELL','input.path','percent_mito','n_counts','n_genes','leiden','celltype0','celltype1','celltype2','celltype3','dblabel'], main_condition='CONDITION'): """export pseudobulk profiles of cells to .gct files This is a function with which any type of labeling (i.e. celltype annotation, louvain clustering, etc.) can be written out to several .gct files as well as a single metadata file. To ensure FAIR compatbility label, and file name should not be changed. parameters ---------- adata: `AnnData` the AnnData object containing the labeling outpath `str` | default = current working directory filepath to the directory in which the results should be outputed, if no directory is specified it outputs the results to the current working directory. column: `str` | default = 'celltype0' Name of the column in adata.obs that is to be mapped to cell barcodes and written out to file. label: `str` | default = 'celltype0' label above the column when it is written out to several files split_condition: `str` | default = 'experiment' the experimental unit, e.g. sample ID todrop: `list` Several column headers to be excluded from metadata main_condition: `str` | default = 'CONDITION' main condition to be outputed in the metadata file returns ------- dfmerge: `pd.DataFrame` merged dataframe """ if outpath is None: outpath = os.getcwd() data = adata.obs.get(column) if data is None: sys.exit('please specify a column name that is present in adata.obs') data = adata.obs.get(column).to_frame(name=label) data = adata.obs.get(main_condition) if data is None: sys.exit('please specify a condition name that is present in adata.obs') ### check if the outdir exists if not create if not os.path.exists(outpath): os.makedirs(outpath) ### create adata subsets for each column value adata.obs[split_condition]=adata.obs[split_condition].astype('str') adata.obs[split_condition]=adata.obs[split_condition].astype('category') adata.obs[column]=adata.obs[column].astype('category') bulks={} myset=list(set(adata.obs[column])) for i in myset: ii=i.replace(" ", "_") ## to avoid spaces in cell names bulks[ii]=adata[adata.obs[column].isin([i])].copy() bulks['all']=adata.copy() ### go through each adata subset and export pseudobulk dfbulks={} for x in bulks.keys(): # sum expression auxdata=bulks[x].copy() myexp=list(auxdata.obs[split_condition].cat.categories) ### these are all different levels for experiments mysums=zeros((len(auxdata.raw.var.index),len(myexp))) for i in range(len(myexp)): mysums[:,i]=expm1(auxdata[auxdata.obs[split_condition]==myexp[i]].raw.X).sum(axis=0) mysums=DataFrame(mysums) mysums.index=adata.raw.var.index mysums.columns=[x+'.'+y for y in myexp] dfbulks[x]=mysums mydat = auxdata.raw.var.loc[:,['SYMBOL', 'ENSEMBL']] mydat.rename(columns={'SYMBOL':'Description'}, inplace=True) gct = mydat.merge(dfbulks[x], how='right', left_index=True, right_index=True) gct.set_index('ENSEMBL', inplace=True) gct.index.names = ['NAME'] gct.columns=['Description']+myexp #write out average expression gctFile_pseudo = outpath+ 'Pseudobulk-'+label+'-'+x+'.gct' with open (gctFile_pseudo,"w") as fp: fp.write("#1.2"+"\n") fp.write(str(gct.shape[0])+'\t'+str(gct.shape[1] - 1)+'\n') # "description" already merged in as a column fp.close() #...and then the matrix gct.to_csv(gctFile_pseudo, sep = '\t', index=True, index_label='NAME', header=True, mode = 'a', float_format='%.3f') print('Pseudobulk-'+label+'-'+x+'.gct exported 
successfully to file') #### Output into single .tsv file dfmerge=concat(dfbulks,axis=1) dfmerge.columns = dfmerge.columns.droplevel() dfmerge.to_csv(outpath+ 'Pseudobulk-'+label+'.tsv',sep='\t',index_label=False) ### Export one metadata file myexp=list(adata.obs[split_condition].cat.categories) colindex=range(0,len(adata.obs.columns)) ### replace if only a subset of metadata should be used mysums=[] for i in range(len(myexp)): mysums.append(list(adata[adata.obs[split_condition]==myexp[i]].obs.iloc[:,colindex].iloc[0,:])) mysums=DataFrame(mysums).transpose() mysums.index=adata[adata.obs[split_condition]==myexp[i]].obs.iloc[:,colindex].columns mysums.columns=myexp mysums=mysums.transpose().drop(labels=todrop,axis=1,errors='ignore') mysums['ID']=list(mysums.index) colorder = ['ID',main_condition] + (mysums.columns.drop(['ID',main_condition]).tolist()) mysums.loc[:,colorder].to_csv(outpath+ 'Pseudobulk.meta',sep='\t',index=False) return(dfmerge) sys.exit(0)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

boston = load_boston()
boston_df = DataFrame(boston.data)
boston_df.columns = boston.feature_names
boston_df['Price'] = boston.target

X_multi = boston_df.drop('Price', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X_multi, boston_df.Price)

lreg = LinearRegression()
lreg.fit(X_train, Y_train)  # fit before predicting, otherwise predict() raises NotFittedError

pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)

# residual plot: residuals vs predictions for the train (blue) and test (red) splits
train = plt.scatter(pred_train, (pred_train - Y_train), c='b', alpha=0.5)
test = plt.scatter(pred_test, (pred_test - Y_test), c='r', alpha=0.5)
plt.hlines(y=0, xmin=1.0, xmax=50)
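# Optional evaluation sketch (not in the original script): quantify the fit on both
# splits.  Note that load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so the snippet above assumes an older scikit-learn.
from sklearn import metrics

print('Train MSE:', metrics.mean_squared_error(Y_train, pred_train))
print('Test MSE:', metrics.mean_squared_error(Y_test, pred_test))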
def flatten( df: pd.DataFrame, reset_index: bool = True, drop_levels: Union[Sequence[int], Sequence[str]] = (), ) -> pd.DataFrame: """ Convert N-dimensional DataFrame to a flat DataFrame :param df: N-dimensional DataFrame. :param reset_index: Convert index to column when df.index isn't RangeIndex :param drop_levels: index of level or names of level might be dropped if df is N-dimensional :return: a flat DataFrame Examples ----------- Convert DatetimeIndex into columns. >>> index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03",]) >>> index.name = "__timestamp" >>> df = pd.DataFrame(index=index, data={"metric": [1, 2, 3]}) >>> df metric __timestamp 2021-01-01 1 2021-01-02 2 2021-01-03 3 >>> df = flatten(df) >>> df __timestamp metric 0 2021-01-01 1 1 2021-01-02 2 2 2021-01-03 3 Convert DatetimeIndex and MultipleIndex into columns >>> iterables = [["foo", "bar"], ["one", "two"]] >>> columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"]) >>> df = pd.DataFrame(index=index, columns=columns, data=1) >>> df level1 foo bar level2 one two one two __timestamp 2021-01-01 1 1 1 1 2021-01-02 1 1 1 1 2021-01-03 1 1 1 1 >>> flatten(df) __timestamp foo, one foo, two bar, one bar, two 0 2021-01-01 1 1 1 1 1 2021-01-02 1 1 1 1 2 2021-01-03 1 1 1 1 """ if _is_multi_index_on_columns(df): df.columns = df.columns.droplevel(drop_levels) _columns = [] for series in df.columns.to_flat_index(): _cells = [] for cell in series if is_sequence(series) else [series]: if pd.notnull(cell): # every cell should be converted to string _cells.append(str(cell)) _columns.append(FLAT_COLUMN_SEPARATOR.join(_cells)) df.columns = _columns if reset_index and not isinstance(df.index, pd.RangeIndex): df = df.reset_index(level=0) return df
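# Hedged usage sketch for flatten(): dropping one level of a column MultiIndex
# before the remaining labels are joined with FLAT_COLUMN_SEPARATOR.  The data are
# illustrative, and the exact flat labels depend on how is_sequence treats bare
# strings.
import pandas as pd

columns = pd.MultiIndex.from_product([["foo", "bar"], ["one", "two"]],
                                     names=["level1", "level2"])
demo = pd.DataFrame(1, index=pd.RangeIndex(3), columns=columns)
flat = flatten(demo, drop_levels=["level1"])
# only the labels of the surviving level ("one", "two", ...) remain as columns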