def test_info_duplicate_columns(self):
    io = StringIO()

    # it works!
    frame = DataFrame(np.random.randn(1500, 4), columns=['a', 'a', 'b', 'b'])
    frame.info(buf=io)
def test_survey_simulation():
    year = 2011
    input_data_frame = get_input_data_frame(year)
    tax_benefit_system_class = openfisca_france.FranceTaxBenefitSystem()
    survey_scenario = SurveyScenario().init_from_data_frame(
        input_data_frame = input_data_frame,
        tax_benefit_system_class = tax_benefit_system_class,
        year = year,
        )
    simulation = survey_scenario.new_simulation()
    try:
        from pandas import DataFrame
        revenu_disponible = DataFrame({"revenu_disponible": simulation.calculate('revenu_disponible')})
    except NaNCreationError as error:
        index = error.index
        entity = error.entity
        column_name = error.column_name
        input_data_frame_debug = filter_input_data_frame(
            simulation.input_data_frame,
            entity,
            index[:10],
            )
        survey_scenario_debug = SurveyScenario()
        simulation_debug = survey_scenario_debug.new_simulation(
            debug = True,
            input_data_frame = input_data_frame_debug,
            tax_benefit_system_class = tax_benefit_system_class,
            year = year,
            )
        simulation_debug.calculate(column_name)

    print(revenu_disponible.info())
    print('finished')
def test_info(self):
    io = StringIO()
    self.frame.info(buf=io)
    self.tsframe.info(buf=io)

    frame = DataFrame(np.random.randn(5, 3))
    frame.info()
    frame.info(verbose=False)
def log_dataframe_info(df: pd.DataFrame):
    """Log the StringIO buffer from the pd.DataFrame.info() method."""
    logger = logging.getLogger()
    buf = StringIO()
    df.info(buf=buf)
    stream = buf.getvalue()  # already a str; no encode/decode round-trip needed
    logger.debug("\n\n```\n{}\n```\n".format(stream))
def test_info(float_frame, datetime_frame):
    io = StringIO()
    float_frame.info(buf=io)
    datetime_frame.info(buf=io)

    frame = DataFrame(np.random.randn(5, 3))
    frame.info()
    frame.info(verbose=False)
def read_spss(spss_file_path):
    with SavReader(spss_file_path, returnHeader=True) as reader:
        for record in reader:
            print(record)
            # records_got.append(record)
    data_frame = DataFrame(list(s.SavReader(spss_file_path)))
    print(data_frame.info())
    return data_frame
class GetDtypeCounts(object):
    # 2807
    def setup(self):
        self.df = DataFrame(np.random.randn(10, 10000))

    def time_frame_get_dtype_counts(self):
        self.df.get_dtype_counts()

    def time_info(self):
        self.df.info()
def storeTransformedData(fpath_save, ds, **kw):
    if 'header' not in kw:
        kw['header'] = None
    if 'index' not in kw:
        kw['index'] = False
    print("Save transformed data to file \"%s\" which has shape %s...\n"
          % (fpath_save, ds.shape), ds)
    df = DataFrame(data=ds)
    df.info()
    df.to_csv(path_or_buf=fpath_save, **kw)
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    io.seek(0)
    lines = io.readlines()
    self.assertEqual("a 1 non-null int64\n", lines[3])
    self.assertEqual("a 1 non-null float64\n", lines[4])
def test_info_empty():
    df = DataFrame()
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 0 entries
        Empty DataFrame"""
    )
    assert result == expected
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=['a', 'a'])
    frame.info(buf=io)
    io.seek(0)
    lines = io.readlines()
    self.assertEqual('a 1 non-null int64\n', lines[3])
    self.assertEqual('a 1 non-null float64\n', lines[4])
def logDataframeInfo(df: pd.DataFrame, dfName: str, callerName: str, logger: logging.Logger):
    """logDataframeInfo logs the info of a dataframe at log level DEBUG."""
    buf = io.StringIO()
    df.info(buf=buf)
    logger.debug('{func:s}: {name:s} info = {info!s}'.format(
        func=callerName, name=dfName, info=buf.getvalue()))
    buf.truncate(0)
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    io.seek(0)
    lines = io.readlines()
    assert " 0 a 1 non-null int64 \n" == lines[5]
    assert " 1 a 1 non-null float64\n" == lines[6]
class GetDtypeCounts:
    # 2807
    def setup(self):
        self.df = DataFrame(np.random.randn(10, 10000))

    def time_frame_get_dtype_counts(self):
        with warnings.catch_warnings(record=True):
            self.df.dtypes.value_counts()

    def time_info(self):
        self.df.info()
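# The two GetDtypeCounts benchmark variants above track a pandas API change:
# DataFrame.get_dtype_counts() was deprecated (around pandas 0.25) and later
# removed, with df.dtypes.value_counts() as the replacement. A minimal sketch
# of the replacement on a throwaway frame:
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(3), "b": np.linspace(0.0, 1.0, 3), "c": list("xyz")})
# Count columns per dtype: here int64, float64 and object, one each
print(df.dtypes.value_counts())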
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=['a', 'a'])
    frame.info(buf=io)
    io.seek(0)
    lines = io.readlines()
    assert 'a 1 non-null int64\n' == lines[3]
    assert 'a 1 non-null float64\n' == lines[4]
def test_info(self):
    io = StringIO()
    self.frame.info(buf=io)
    self.tsframe.info(buf=io)

    frame = DataFrame(np.random.randn(5, 3))

    import sys
    sys.stdout = StringIO()
    frame.info()
    frame.info(verbose=False)
    sys.stdout = sys.__stdout__
def read_spss(spss_file_path):
    print(spss_file_path)
    with SavReader(spss_file_path, returnHeader=True) as reader:
        for record in reader:
            print(record)
            # records_got.append(record)
    data_frame = DataFrame(list(s.SavReader(spss_file_path)))
    print(data_frame.info())
    return data_frame
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    frame = DataFrame(np.random.randn(5, num_columns))
    with option_context("display.max_info_columns", max_info_columns):
        io_default = StringIO()
        frame.info(buf=io_default)
        result = io_default.getvalue()

        io_explicit = StringIO()
        frame.info(buf=io_explicit, verbose=verbose)
        expected = io_explicit.getvalue()

        assert result == expected
def test_to_string_unicode_columns(self):
    df = DataFrame({u'\u03c3': np.arange(10.)})

    buf = StringIO()
    df.to_string(buf=buf)
    buf.getvalue()

    buf = StringIO()
    df.info(buf=buf)
    buf.getvalue()

    result = self.frame.to_string(force_unicode=True)
    self.assert_(isinstance(result, unicode))
def test_to_string_unicode_columns(self):
    df = DataFrame({u'\u03c3': np.arange(10.)})

    buf = StringIO()
    df.to_string(buf=buf)
    buf.getvalue()

    buf = StringIO()
    df.info(buf=buf)
    buf.getvalue()

    result = self.frame.to_string()
    self.assert_(isinstance(result, unicode))
def test_info_shows_column_dtypes(self):
    dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]",
              "complex128", "object", "bool"]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    for i, dtype in enumerate(dtypes):
        name = "%d %d non-null %s" % (i, n, dtype)
        assert name in res
def test_to_string_unicode_columns(float_frame):
    df = DataFrame({"\u03c3": np.arange(10.0)})

    buf = StringIO()
    df.to_string(buf=buf)
    buf.getvalue()

    buf = StringIO()
    df.info(buf=buf)
    buf.getvalue()

    result = float_frame.to_string()
    assert isinstance(result, str)
def test_info_categorical_column_smoke_test():
    n = 2500
    df = DataFrame({"int64": np.random.randint(100, size=n)})
    df["category"] = Series(
        np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
    ).astype("category")
    df.isna()
    buf = StringIO()
    df.info(buf=buf)

    df2 = df[df["category"] == "d"]
    buf = StringIO()
    df2.info(buf=buf)
def test_info_categorical_column(self):
    # make sure it works
    n = 2500
    df = DataFrame({'int64': np.random.randint(100, size=n)})
    df['category'] = Series(np.array(list('abcdefghij')).take(
        np.random.randint(0, 10, size=n))).astype('category')
    df.isna()
    buf = StringIO()
    df.info(buf=buf)

    df2 = df[df['category'] == 'd']
    buf = StringIO()
    df2.info(buf=buf)
def test_info_memory_usage_qualified(self):
    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=[1, 2, 3])
    df.info(buf=buf)
    assert '+' not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=list('ABC'))
    df.info(buf=buf)
    assert '+' in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3), range(3)]))
    df.info(buf=buf)
    assert '+' not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3), ['foo', 'bar']]))
    df.info(buf=buf)
    assert '+' in buf.getvalue()
def process_content_info(content: pd.DataFrame):
    content_info = StringIO()
    content.info(buf=content_info)
    str_ = content_info.getvalue()

    lines = str_.split("\n")
    table = StringIO("\n".join(lines[3:-3]))
    datatypes = pd.read_table(table, delim_whitespace=True,
                              names=["column", "count", "null", "dtype"])
    datatypes.set_index("column", inplace=True)

    info = "\n".join(lines[0:2] + lines[-2:-1])
    return info, datatypes
def analyze_dataframe(content: pd.DataFrame):
    content_info = io.StringIO()
    content.info(buf=content_info)
    str_ = content_info.getvalue()

    lines = str_.split("\n")
    table = StringIO("\n".join(lines[3:-3]))
    datatypes = pd.read_table(
        table, delim_whitespace=True,
        names=["#", "column", "Non-Null", "Count", "Dtype"])
    datatypes.set_index("#", inplace=True)

    info = "\n".join(lines[0:2] + lines[-2:-1])
    return datatypes
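# Both process_content_info and analyze_dataframe above scrape the text that
# DataFrame.info() writes to a buffer, and each hard-codes a different line
# layout (the per-column table gained a '#' index and a 'Non-Null Count'
# header in later pandas). A layout-independent sketch that builds the same
# table from public accessors instead of parsing rendered text:
import pandas as pd

def dtype_table(df: pd.DataFrame) -> pd.DataFrame:
    # Same column / non-null / dtype summary as info(), but version-proof
    return pd.DataFrame({
        "non_null": df.count(),
        "null": df.isna().sum(),
        "dtype": df.dtypes.astype(str),
    })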
def test_info_memory_usage_qualified(self):
    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=[1, 2, 3])
    df.info(buf=buf)
    self.assertFalse('+' in buf.getvalue())

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=list('ABC'))
    df.info(buf=buf)
    self.assertTrue('+' in buf.getvalue())

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3), range(3)]))
    df.info(buf=buf)
    self.assertFalse('+' in buf.getvalue())

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3), ['foo', 'bar']]))
    df.info(buf=buf)
    self.assertTrue('+' in buf.getvalue())
def test_info_max_cols(self):
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (10, True)]:
        # For verbose always      ^ setting  ^ summarize ^ full output
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, verbose in [(10, None), (5, False), (10, True)]:
        # max_cols not exceeded
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, max_cols in [(10, 5), (5, 4)]:
        # setting truncates
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

        # setting wouldn't truncate
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)
def test_info_memory_usage_qualified(self):
    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
    df.info(buf=buf)
    assert "+" in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1,
        columns=list("ab"),
        index=pd.MultiIndex.from_product([range(3), range(3)]),
    )
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1,
        columns=list("ab"),
        index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
    )
    df.info(buf=buf)
    assert "+" in buf.getvalue()
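# The '+' these test variants look for is the qualifier pandas appends when
# the reported memory usage is only a lower bound: object dtypes (including an
# object-typed index level) are counted at pointer size unless deep
# introspection is requested. A quick illustrative sketch:
import pandas as pd

df = pd.DataFrame({"a": ["x", "y", "z"]})
print(df.memory_usage().sum())           # shallow: counts only the object pointers
print(df.memory_usage(deep=True).sum())  # deep: adds the actual string payloads
# df.info() accordingly prints "memory usage: ...+" in the shallow case and an
# unqualified figure with memory_usage='deep'.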
def test_info_shows_column_dtypes(self):
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    for i, dtype in enumerate(dtypes):
        name = '%d %d non-null %s' % (i, n, dtype)
        assert name in res
def test_info_max_cols():
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (12, True)]:
        # For verbose always      ^ setting  ^ summarize ^ full output
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, verbose in [(12, None), (5, False), (12, True)]:
        # max_cols not exceeded
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_
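# As the two test_info_max_cols variants encode, verbose=None defers to the
# display.max_info_columns option: the per-column table is rendered only while
# the column count does not exceed the threshold. A minimal sketch of the knob:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 6))
with pd.option_context("display.max_info_columns", 4):
    df.info()   # 6 > 4 columns: falls back to the short dtype summary
with pd.option_context("display.max_info_columns", 10):
    df.info()   # 6 <= 10 columns: full per-column table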
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode."""
    frame = DataFrame(np.random.randn(3, size))
    with StringIO() as buf:
        frame.info(verbose=True, show_counts=True, buf=buf)
        all_lines = buf.getvalue().splitlines()
        # Here table would contain only header, separator and table lines
        # dframe repr, index summary, memory usage and dtypes are excluded
        table = all_lines[3:-2]
        header, separator, first_line, *rest, last_line = table
        assert header == header_exp
        assert separator == separator_exp
        assert first_line == first_line_exp
        assert last_line == last_line_exp
def process_content_info(content: pd.DataFrame):
    content_info = StringIO()
    content.info(buf=content_info, null_counts=False)
    str_ = content_info.getvalue()

    lines = str_.split("\n")
    table = StringIO("\n".join(lines[3:-3]))
    datatypes = pd.read_table(table, delim_whitespace=True,
                              names=["Feature", "dtype"])
    datatypes = pd.DataFrame(datatypes)

    null = np.array(content.isnull().sum())
    datatypes['null'] = null
    return datatypes
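# Note the keyword drift between this snippet (null_counts) and
# test_info_verbose_with_counts_spacing above (show_counts):
# DataFrame.info()'s null_counts argument was deprecated in favour of
# show_counts around pandas 1.2 and later removed. A version-tolerant caller
# might hedge like this sketch:
import pandas as pd
from io import StringIO

def info_without_counts(df: pd.DataFrame) -> str:
    buf = StringIO()
    try:
        df.info(buf=buf, show_counts=False)   # pandas >= 1.2
    except TypeError:
        df.info(buf=buf, null_counts=False)   # older pandas
    return buf.getvalue()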
def test_survey_simulation():
    year = 2011
    input_data_frame = get_input_data_frame(year)
    tax_benefit_system_class = openfisca_france.FranceTaxBenefitSystem()
    survey_scenario = SurveyScenario().init_from_data_frame(
        input_data_frame=input_data_frame,
        tax_benefit_system_class=tax_benefit_system_class,
        year=year,
        )
    simulation = survey_scenario.new_simulation()
    try:
        from pandas import DataFrame
        revenu_disponible = DataFrame(
            {"revenu_disponible": simulation.calculate('revenu_disponible')})
    except NaNCreationError as error:
        index = error.index
        entity = error.entity
        column_name = error.column_name
        input_data_frame_debug = filter_input_data_frame(
            simulation.input_data_frame,
            entity,
            index[:10],
            )
        survey_scenario_debug = SurveyScenario()
        simulation_debug = survey_scenario_debug.new_simulation(
            debug=True,
            input_data_frame=input_data_frame_debug,
            tax_benefit_system_class=tax_benefit_system_class,
            year=year,
            )
        simulation_debug.calculate(column_name)

    print(revenu_disponible.info())
def debug_dataframe(
    df: pd.DataFrame,
    msg,
    nrows=5,
    usecols=None,
    # **kwargs
):
    '''Help diagnose issues with dataframes'''
    # pd.set_option('display.max_rows', 200)
    # pd.set_option('display.max_colwidth', -1)
    # verbose=True
    intro = """
    === Debug dataframe : {msg} ===
    """
    log.debug(intro.format(msg=msg))
    # NB: df.info() prints to stdout and returns None; to capture it in the
    # log, pass an io.StringIO via the buf= keyword instead.
    log.debug(df.info())
    # log.debug(df.columns)
    log.debug(pp.pformat(df.dtypes))
    with pd.option_context('float_format', '{:f}'.format):
        sdf = df
        if usecols:
            sdf = df[usecols]
        print(sdf.head(nrows))
def view_info(df: DataFrame) -> str:
    """
    View summary information about the data
    :param df:
    :return:
    """
    if isinstance(df, DataFrame):
        _head = df.head()          # first 5 rows
        _tail = df.tail()          # last 5 rows
        _columns = df.columns      # column names
        _index = df.index          # index
        _shape = df.shape          # shape
        _describe = df.describe()  # descriptive statistics for each column
        # df.info() prints to stdout and returns None, so capture its output
        # in a buffer instead of interpolating the (None) return value
        buf = StringIO()
        df.info(buf=buf)           # missing values and dtype of each column
        _df_info = buf.getvalue()
        result = f"{'='*10}first 5 rows{'='*10}\n{_head}\n\n" \
                 f"{'='*10}last 5 rows{'='*10}\n{_tail}\n\n" \
                 f"{'='*10}column names{'='*10}\n{_columns}\n\n" \
                 f"{'='*10}index{'='*10}\n{_index}\n\n" \
                 f"{'='*10}shape{'='*10}\n{_shape}\n\n" \
                 f"{'='*10}descriptive statistics{'='*10}\n{_describe}\n\n" \
                 f"{'='*10}missing values and dtypes{'='*10}\n{_df_info}\n\n"
        return result
    else:
        raise TypeError("df's type is not DataFrame!")
def generate_df_descriptive_stats(df: pd.DataFrame):
    # df.info()
    buffer = io.StringIO()
    df.info(buf=buffer)
    info: str = buffer.getvalue()

    # df.describe()
    # NOTE: needs to re-serialise for the np int64 problem
    describe_json: str = (
        df.describe(include="all")
        .transpose()
        .reset_index()
        .rename(columns={"index": "variable"})
        .to_json(orient="records")
    )
    describe: List[Dict[str, Any]] = json.loads(describe_json)

    # combine
    res = {"info": info, "describe": describe}
    return res
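# A hypothetical usage sketch for the helper above: the "describe" half
# serialises cleanly because it was round-tripped through to_json(), which
# strips the numpy scalar types that json.dumps() cannot handle.
import json
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "b"]})
stats = generate_df_descriptive_stats(df)
print(stats["info"])                            # plain text from DataFrame.info()
print(json.dumps(stats["describe"], indent=2))  # one record per variable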
def test_info_wide(self):
    from pandas import set_option, reset_option
    io = StringIO()
    df = DataFrame(np.random.randn(5, 101))
    df.info(buf=io)

    io = StringIO()
    df.info(buf=io, max_cols=101)
    rs = io.getvalue()
    self.assertTrue(len(rs.splitlines()) > 100)
    xp = rs

    set_option('display.max_info_columns', 101)
    io = StringIO()
    df.info(buf=io)
    rs = io.getvalue()  # re-read the buffer so the comparison below is not vacuous
    self.assertEqual(rs, xp)
    reset_option('display.max_info_columns')
# 4.13.2016
# @totallygloria

import json

from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path, 'rb')]
frame = DataFrame(records)
print(frame.info())

tz_counts = frame['tz'].value_counts()
cy_counts = frame['cy'].value_counts()
l_counts = frame['l'].value_counts()

print(tz_counts[:10])
# print(cy_counts[:20])
# print(l_counts[:20])

clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'TZ Unknown'
tz_counts = clean_tz.value_counts()
# plt.show()

t = 1
data['predicted_purchases'] = data.apply(
    lambda r: bgf.conditional_expected_number_of_purchases_up_to_time(
        t, r['frequency'], r['recency'], r['T']),
    axis=1)
print(data.sort_values('predicted_purchases').tail(5))  # DataFrame.sort() was removed; sort_values() is the replacement

from lifetimes.datasets import load_transaction_data
from lifetimes.utils import summary_data_from_transaction_data

transaction_data = load_transaction_data()
print(transaction_data.head())
print(type(transaction_data))
print(transaction_data.columns)
print(data.columns)
print(data.head())

t = 10
data['predicted_purchases'] = data.apply(
    lambda r: bgf.conditional_expected_number_of_purchases_up_to_time(
        t, r['frequency'], r['recency'], r['T']),
    axis=1)
print(data)

from pandas import DataFrame
d = [{'id': 1, 'R': 23, 'F': 12, 'M': 12.5},
     {'id': 2, 'R': 43, 'F': 1, 'M': 120.5},
     {'id': 3, 'R': 203, 'F': 2, 'M': 19.5}]
test = DataFrame(d)
print(test)
print(test.info())
print(test['R'])

ggf = GammaGammaFitter(penalizer_coef=0)
for moment in samples_dicts:
    if moment < firststart:
        continue
    if moment > laststart:
        break
    task_usage_moment_df = task_usage_df[
        (task_usage_df['starttime'] <= moment) & (moment < task_usage_df['endtime'])]
    # print(task_usage_moment_df.info())
    samples_dicts[moment]['cpu_usage'] += sum(task_usage_moment_df['cpu_usage'])
    samples_dicts[moment]['mem_usage'] += sum(task_usage_moment_df['mem_usage'])
    samples_dicts[moment]['disk_io_time'] += sum(task_usage_moment_df['disk_io_time'])
    samples_dicts[moment]['disk_space'] += sum(task_usage_moment_df['mean_local_disk_space'])
    samples_dicts[moment]['number_of_running_task'] += len(task_usage_moment_df['cpu_usage'])

if totalreadfile == 50:
    samples_df = DataFrame(list(samples_dicts.values()))  # list() for Python 3 dict views
    print(samples_df.info())
    try:
        samples_df.to_csv(
            path.join(results_directory,
                      'machine_usage_sampling_machineid_' + str(machine_id) +
                      '_interval_' + str(interval) + '.csv'),
            index=False)
    except Exception:
        print('could not write the csv file')  # translated from Vietnamese
    totalreadfile = 0

samples_df = DataFrame(list(samples_dicts.values()))
print(samples_df.info())
try:
    samples_df.to_csv(
        path.join(results_directory,
                  'machine_usage_sampling_machineid_' + str(machine_id) +
                  '_interval_' + str(interval) + '.csv'),
        index=False)
except Exception:
    print('could not write the csv file')  # translated from Vietnamese
# In[71]:
submission.head()

# In[75]:
submission.to_csv('submit1_KMeans.csv', index=False)

# ##### We also have to convert the Solution column from float to int, else a 0 score is obtained

# In[82]:
submission.info()

# In[91]:
submission['Solution'] = submission['Solution'].astype(int)

# In[92]:
submission.info()

# In[93]:
submission['Solution'].value_counts()
def test_info_memory_usage(self):
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " in res[-1])

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " not in res[-1])

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index.info(buf=buf, memory_usage='deep')
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

    self.assertTrue(df_with_object_index.memory_usage(index=True, deep=True).sum() >
                    df_with_object_index.memory_usage(index=True).sum())

    df_object = pd.DataFrame({'a': ['a']})
    self.assertTrue(df_object.memory_usage(deep=True).sum() >
                    df_object.memory_usage().sum())

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    self.assertEqual(df_size, exp_size)

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    self.assertEqual(size_df, np.size(df.memory_usage()))

    # assert deep works only on object
    self.assertEqual(df.memory_usage().sum(), df.memory_usage(deep=True).sum())

    # test for validity
    DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
    DataFrame(1, index=['a'], columns=['A']).index.nbytes
    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product([['a'], range(1000)]),
        columns=['A'])
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
    self.assertTrue(abs(diff) < 100)
    # (tail of the CreateDataSet helper used below)
    # NB: np.randint only resolves if numpy.random was imported as np,
    # as in the original pandas lessons ("import numpy.random as np")
    random_status = [status[np.randint(low=0, high=len(status))] for i in range(len(rng))]

    # State pool
    states = ['GA', 'FL', 'fl', 'NY', 'NJ', 'TX']

    # Make a random list of states
    random_states = [states[np.randint(low=0, high=len(states))] for i in range(len(rng))]

    Output.extend(zip(random_states, random_status, data, rng))
    return Output

# Now that we have a function to generate our test data, let's create some data
# and stick it into a dataframe
dataset = CreateDataSet(4)
df = DataFrame(data=dataset, columns=['State', 'Status', 'CustomerCount', 'StatusDate'])
df.info()
df.head()

# Save results to excel
df.to_excel('Lesson3.xlsx', index=False)
print('Done')

# Grab data from excel
# read_excel?
Location = r"C:\Users\ABaker\Documents\Python Scripts\Lesson3.xlsx"  # raw string: backslashes in a Windows path

# Parse a specific sheet
df = read_excel(Location, 0, index_col='StatusDate')
df.dtypes
df.head()
def create_fip(year = None):
    assert year is not None
    # fip: fichier d'imposition des personnes (individual tax filing file)
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, \
        "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)

    # TODO: add box I: "including children holding a disability card"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # filter to work only on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those with distinct declar/naia pairs plus the twins,
    # then drop the others (both F and G)
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(
        len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI

    # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(
            pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u"    2.1 new fip created")

    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])

    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))
    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(
        pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec

    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1 <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    # TODO: declar2 is not handled for now
    # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # fip2 <-merge(fip,individec2)
    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?

    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec problem for fip children between 16 and 20: unknown whether they are students or employed
    # TODO: problem with the months of FIP children: check whether these values can be recovered (Alexis: clearly not)

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]
    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
%pylab inline
import seaborn
import matplotlib.dates as md
from matplotlib import pyplot as plt
from sklearn import preprocessing
from zoo.pipeline.api.keras.layers import Dense, Dropout, LSTM
from zoo.pipeline.api.keras.models import Sequential

try:
    dataset_path = "/home/cdsw/nyc_taxi.csv"
    df = pd.read_csv(dataset_path)
except Exception as e:
    print("nyc_taxi.csv doesn't exist")
    print("you can run $ANALYTICS_ZOO_HOME/bin/data/NAB/nyc_taxi/get_nyc_taxi.sh to download nyc_taxi.csv")

print(df.info())

# check the timestamp format and frequency
print(df['timestamp'].head(10))
# check the mean of passenger number
print(df['value'].mean())

# change the type of timestamp column for plotting
df['datetime'] = pd.to_datetime(df['timestamp'])

# visualisation of anomaly throughout time (viz 1)
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['datetime'], df['value'], color='blue', linewidth=0.6)
ax.set_title('NYC taxi passengers throughout time')
# NB: randint with low/high keywords is numpy's (from numpy import random);
# the stdlib random.randint takes positional bounds only
random_names = [names[random.randint(low=0, high=len(names))] for i in range(1000)]
births = [random.randint(low=0, high=1000) for i in range(1000)]
# print(random_names[:10])
# print(births[:10])

dataset = list(zip(random_names, births))  # list() so DataFrame sees the rows under Python 3
df = DataFrame(data=dataset, columns=['Names', 'Births'])
# print(df[:10])
df.to_csv("births1880.txt", index=False, header=False)

df = read_csv(r'./births1880.txt', names=["Names", "Births"])
print("df.info - overall info of df")
print(df.info())
print("df.head - first 5 rows")
print(df.head())

import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()
print("df['Names'].unique()")
print(uqNames)
print("df.Names.describe()")
print(df['Names'].describe())

df = df.groupby("Names")  # group by name
print(df)
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame
import extract_2

soup = extract_2.soup_dict
data = DataFrame(soup)
data = data.T
print(data.info())
print(data['source_register'].value_counts())

def after_str(data):
    patterns = ['Planned Sample Size: ', 'UK Sample Size: ', 'More than ', 'Total ']
    for pattern in patterns:
        pattern_1 = '(' + pattern + ')([0-9]+)'
        print(data['target_size'][data['target_size'].str.contains(pattern)].tail())
        temp = data['target_size'][data['target_size'].str.contains(pattern)]
        temp2 = temp.str.findall(pattern_1).str[0].str[1]
        for i in temp.index:
            data['target_size'].loc[i] = temp2.loc[i]  # .ix is deprecated; .loc is the label-based equivalent

def before_str(data):
    patterns = [' patients to be recruited', r' \(212 by end of recruitment']
    for pattern in patterns:
        pattern_1 = '([0-9]+)(' + pattern + ')'
        print(data['target_size'][data['target_size'].str.contains(pattern)].tail())
        temp = data['target_size'][data['target_size'].str.contains(pattern)]
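# Both helpers above loop row-by-row and assign element-wise; pandas'
# vectorised str.extract can pull the same digits in one pass. A sketch of the
# after_str case under that approach (same hard-coded pattern list assumed):
import pandas as pd

def after_str_vectorised(data: pd.DataFrame) -> None:
    patterns = ['Planned Sample Size: ', 'UK Sample Size: ', 'More than ', 'Total ']
    for pattern in patterns:
        mask = data['target_size'].str.contains(pattern, na=False)
        # capture the digits that immediately follow the marker text
        extracted = data.loc[mask, 'target_size'].str.extract(
            '(?:' + pattern + r')(\d+)', expand=False)
        data.loc[mask, 'target_size'] = extracted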