def getComptes(year):
    urlBase = 'http://alize2.finances.gouv.fr/communes/eneuro/detail.php?icom=056&dep=075&type=BPS&param=5&exercice='
    urlToCook = urlBase + str(year)
    soup = getSoupFromUrl(urlToCook)
    colEur_p_hab = soup.select("body > table:nth-of-type(3) tr > td:nth-of-type(2)")
    colMoy_d_str = soup.select("body > table:nth-of-type(3) tr > td:nth-of-type(3)")
    numEur_p_hab = []
    numMoy_d_str = []
    rowst = 3
    rowskip = 2  # careful: there are 2 table headers that are not tr
    for c1 in colEur_p_hab[rowst:]:
        if c1.text != '':
            numEur_p_hab.append(int(c1.text.replace(' ', '')))
    for c2 in colMoy_d_str[rowst:]:
        if c2.text != '':
            numMoy_d_str.append(int(c2.text.replace(' ', '')))
    data = DataFrame({'Eur/hab': numEur_p_hab, 'Moy strate': numMoy_d_str})
    rowIds = [i - (rowst + rowskip) for i in [5, 9, 16, 21]]
    print "Résultats consolidés pour la ville de Paris (exercice " + str(year) + ")"
    print data.irow(rowIds)
    return None
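# NOTE: getSoupFromUrl is not defined in this snippet. A minimal sketch of
# what it presumably does, assuming requests and BeautifulSoup4 (both
# library choices are assumptions, not the author's code):
import requests
from bs4 import BeautifulSoup

def getSoupFromUrl(url):
    # fetch the page and parse it so that soup.select() works as used above
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')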
def main():
    ser = Series(np.arange(3.))
    ser2 = Series(np.arange(3.), index=list('abc'))
    print ser
    print ser2
    print '', ''
    # print ser[-1]
    print ser2[-1]
    print '', ''
    print ser.ix[:1]
    print '', ''
    ser3 = Series(range(3), index=[-5, 1, 3])
    print ser3
    print '', ''
    print ser3.iget_value(2)
    print '', ''
    frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
    print frame
    print '', ''
    print frame.irow(0)

    # panel
    # each item (column) of a 3-dimensional DataFrame, i.e. a Panel, is a DataFrame
    print '', '----------------------'
    lst = ['AAPL', 'MSFT']  # , 'DELL', 'GOOG'
    pdata = pd.Panel(dict((stk, pd.io.data.get_data_yahoo(stk, '1/1/2009', '6/1/2012'))
                          for stk in lst))
    if not pdata.empty:
        print pdata
        pdata = pdata.swapaxes('items', 'minor')
        print '', ''
        print pdata['Adj Close']
        print '', ''
        print pdata.ix[:, '6/1/2012', :]
        print '', ''
        print pdata.ix['Adj Close', '5/22/2012', :]
        print '', ''
        print type(pdata.ix[:, '5/30/2012', :])  # DataFrame
        if hasattr(pdata.ix[:, '5/30/2012', :], 'to_frame'):
            stacked = pdata.ix[:, '5/30/2012', :].to_frame()
            print stacked
            print '', ''
            print stacked.to_panel()
        if hasattr(pdata, 'to_frame'):
            f1 = pdata.to_frame()
            print f1
            print '', ''
            print f1.to_panel()
    print '', ''
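# NOTE: pd.Panel and pd.io.data no longer exist in current pandas
# (pandas.io.data moved to the separate pandas-datareader package, and
# Panel was removed in pandas 0.25). A minimal sketch of the replacement
# idiom, a MultiIndex DataFrame built with pd.concat; the tickers and
# values below are placeholders, not real quotes:
import numpy as np
import pandas as pd

dates = pd.date_range('2009-01-01', periods=5)
frames = {stk: pd.DataFrame(np.random.randn(5, 2),
                            index=dates, columns=['Close', 'Volume'])
          for stk in ['AAPL', 'MSFT']}

# dict keys become the outer index level, one level per former Panel item
pdata = pd.concat(frames, names=['ticker', 'date'])

print(pdata.xs('AAPL', level='ticker'))  # one ticker, cf. pdata['AAPL']
print(pdata.xs(dates[0], level='date'))  # one date across all tickers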
# iterate over column names
for r in df:
    print(r)

# iterate over columns
for cName in df:
    print('column of df:\n', cName)
    print('values of df:\n', df[cName])
print("---------------------")

# iterate over rows, method 1
for rIndex in df.index:
    print('now at row ', rIndex)
    print(df.irow(rIndex))

# iterate over rows, method 2
for r in df.values:
    print(r)
    print(r[0])
    print(r[1])
print("---------------------")

# iterate over rows, method 3
for index, row in df.iterrows():
    print('row ', index, ':')
    print(row)
print("---------------------")
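# NOTE: when row-wise iteration is genuinely needed, itertuples() is usually
# faster than iterrows() because it yields lightweight namedtuples instead of
# building a Series per row. A minimal sketch with placeholder columns:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})  # placeholder data
for row in df.itertuples():
    print(row.Index, row.a, row.b)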
DataFrame([Series([1, 2, 3]),
           Series(['John', 'Amy', 'Mark']),
           Series([True, False, True])])
DataFrame([[1, 2, 3], ['J', 'A', 'M'], [True, False, True]])
'c' in frame2.index
'Washu' in frame2.columns
'Washu' in frame2
frame2.drop('d')
frame2.drop('Washu', axis=1)
frame2[frame2['Washu'] > 0]
frame2 = frame2.fillna(0)
frame2.ix[frame2.Washu > 0, frame2.ix['d'] > 0]
frame2.xs('d')
frame2.xs('UM', axis=1)
frame2.icol(2)
frame2.irow(4)
frame2.add(frame3, fill_value=0)
frame2.applymap(lambda x: '%.2f' % x)

df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.index.is_unique
df.index.unique()
df.ix['a']
frame2.describe()

"""
Unique values, value counts, membership
Not unique indices
"""
obj = Series(list('cbdaabbcc'))
obj.unique()
s = obj.value_counts()
pd.value_counts(obj.values, sort=False)
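# NOTE: DataFrame.applymap (used above) was renamed to DataFrame.map in
# pandas 2.1, where applymap now emits a deprecation warning. A minimal
# sketch of the newer spelling (assumes pandas >= 2.1; placeholder data):
import pandas as pd

frame = pd.DataFrame([[1.234, 5.678], [9.1011, 2.345]])
print(frame.map(lambda x: '%.2f' % x))  # element-wise, was frame.applymap(...)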
data.w            # select column 'w' by attribute access; returns a Series
data[['w']]       # select column 'w'; returns a DataFrame
data[['w', 'z']]  # select columns 'w' and 'z'
data[0:2]         # rows 1 through 2; the interval is half-open,
                  # start included, end excluded
data[1:2]         # row 2 (counting from 0) as a single row, via a slice with
                  # both endpoints; data[1] on its own would raise an error
data.ix[1:2]      # a third way to get row 2; returns a DataFrame,
                  # same as data[1:2]
data['a':'b']     # slice by index labels; returns a DataFrame over a
                  # closed interval, i.e. the end label is included
data.irow(0)      # first row of data
data.icol(0)      # first column of data
data.head()       # leading rows, five by default; use data.head(10) for ten
data.tail()       # trailing rows, five by default; use data.tail(10) for ten
ser.iget_value(0)   # first element of the series ser
ser.iget_value(-1)  # last element of ser; when the axis index contains integer
                    # indexers, ser[-1] is ambiguous and cannot be used for this
data.iloc[-1]     # last row of the DataFrame; returns a Series
data.iloc[-1:]    # last row of the DataFrame; returns a DataFrame
data.loc['a', ['w', 'x']]  # row 'a', columns 'w' and 'x'; for when the row
                           # and column labels are known
data.iat[1, 1]    # second row, second column; for when the row and column
                  # positions are known
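# NOTE: irow/icol/iget_value and .ix used above were all removed in later
# pandas releases. A minimal sketch of the modern equivalents (pandas >= 1.0),
# with placeholder data standing in for the data/ser objects above:
import numpy as np
import pandas as pd

data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=list('abc'), columns=list('wxyz'))
ser = pd.Series([9, 8, 4], index=[-5, 1, 3])

print(data.iloc[0])       # was data.irow(0)
print(data.iloc[:, 0])    # was data.icol(0)
print(ser.iloc[0])        # was ser.iget_value(0)
print(ser.iloc[-1])       # was ser.iget_value(-1)
print(data.loc['a':'b'])  # label slice; .loc covers the label uses of .ix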
'''
s.iget_value(1); s.iget_value(4); s.iget_value(0)  # 9, 8, 4

frame = DataFrame(
    np.arange(6).reshape(3, 2),  # 3x2 matrix
    index=[2, 0, 1]
)
frame
'''
   0  1
2  0  1
0  2  3
1  4  5
'''
frame.irow(0)  # get row 0 (index==2), get values of all its cols
'''
0    0
1    1
Name: 2, dtype: int64
'''
frame.irow(2)  # get row 2 (index==1), get vals of all its cols
'''
0    4
1    5
Name: 1, dtype: int64
'''
frame.icol(1)  # in this case, ==frame.icol(-1)
'''
ser2[-1]
ser.ix[:1]
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iget_value(2)
ser3.iloc[2]
ser3.iat[2]
ser3
ser3.iloc[1]
ser3.iloc[0]
frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
frame.irow(0)
frame.iloc[0]
frame
frame.iloc[2]
from pandas_datareader import data
pdata = pd.Panel(dict((stk, data.get_data_google(stk, '1/1/2009', '6/1/2012'))
                      for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))
pdata
pdata = pdata.swapaxes('items', 'minor')
pdata['Close']
pdata.ix[:, '6/1/2012', :]
pdata.ix['Close', '5/22/2012':, :]
stacked = pdata.ix['Close', '5/22/2012':, :].to_frame()
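# NOTE: the Google Finance endpoint behind get_data_google stopped working
# after Google shut the API down. A minimal sketch of switching the source,
# assuming the Stooq reader in pandas-datareader (availability of any remote
# source varies over time):
import pandas_datareader.data as web

quotes = {stk: web.DataReader(stk, 'stooq', '2009-01-01', '2012-06-01')
          for stk in ['AAPL', 'MSFT']}
print(quotes['AAPL'].head())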
print 'with a letter index, -1 can be used directly to access the last element', ser2[-1]
# if the axis index contains an integer indexer, selecting data by integer
# is label-oriented, not order-oriented
print '3 is treated as a label, not a position', ser_neg.ix[:3]
# reliable position-based indexing regardless of the index type:
# use Series.iget_value
ser3 = Series(range(3), index=[-5, 1, 3])
# this may be removed in a future release; prefer the equivalent call
print ser3.iget_value(2)
# equivalent to
print ser3.iloc[2]
# for a DataFrame, use irow
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
# this may also be removed in a future release, so prefer the equivalent call
print frame.irow(0)
print frame.iloc[0]

# panel data
# pandas has a Panel data structure, best understood as a
# three-dimensional DataFrame
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012'))
                      for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))
print 'Panel', pdata
# each item of a Panel is a DataFrame
# swap two of the axes
pdata = pdata.swapaxes('items', 'minor')
print pdata['Adj Close']
# ix-based label indexing generalizes to three dimensions
print pdata.ix[:, '6/1/2012', :]
print pdata.ix['Adj Close', '5/22/2012':, :]
# to present panel data, especially when fitting statistical models,
# use the "stacked" DataFrame form
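# NOTE: for genuinely 3-D data, the pandas deprecation message pointed
# Panel users to xarray. A minimal sketch (dummy values and hypothetical
# dimension names), assuming xarray is installed:
import numpy as np
import xarray as xr

arr = xr.DataArray(np.random.randn(2, 3, 4),
                   dims=['field', 'date', 'ticker'],
                   coords={'field': ['Adj Close', 'Volume']})
print(arr.sel(field='Adj Close'))  # cf. pdata['Adj Close'] above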
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found -> forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # slice with label
    print obj["b":"c"]  # includes 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow: the i means integer position
    print data.icol(1)
    print data.irow(1)

    # union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # the index is the union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic methods
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)  # reindex also has a fill_value argument
    # the other arithmetic methods are sub/div/mul

    # arithmetic between a DataFrame and a Series
    print "", ""
    # subtract from each row: broadcasting
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series
    series2 = Series(range(3), index=list("bef"))
    print frame + series2
    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply functions and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                      columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # sorting a Series by value
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # sorting by multiple columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # the default method is "average"
    print obj.rank(method="first")  # no duplicate ranks
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge on the indexes
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # axis indexes with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
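# NOTE: Series.order() and sort_index(by=...) used above were later folded
# into sort_values (pandas >= 0.17). A minimal sketch of the current
# spellings:
import pandas as pd

obj = pd.Series([4, 7, -3, 2])
print(obj.sort_values())                 # was obj.order()

frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame.sort_values(by=["a", "b"]))  # was frame.sort_index(by=[...])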
# get by index
print ser2[-1]   # 2.0
print ser2['c']  # 2.0
print ser2[2]    # 2.0
# fetch values through the index
print ser.ix[:1]
# 0    0
# 1    1
# Series.iget_value indexes by position only
ser3 = Series(range(3), index=[-5, 1, 3])
print ser3.iget_value(2)  # 2

frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
print frame
#    0  1
# 2  0  1
# 0  2  3
# 1  4  5
# get the first row
print frame.irow(0)
# 0    0
# 1    1
# Name: 2
d = DataFrame(
    {'a': range(0, 10), 'b': range(10, 20), 'c': range(20, 30)}
)
#%%
d[0]  # error
#%%
d['a']  # Series, a column
#%%
d[['a', 'c']]  # DataFrame, columns
#%%
d[:5]  # DataFrame, rows
#%%
d.ix[:5]  # position-based, rows
#%%
d1.ix[:5]  # label-based, rows
#%%
d.irow(0)  # Series
#%%
d.icol(0)  # Series
#%%
d.get_value('e', 'a')  # get_value(row_name, col_name)
#%% methods that force position-based element access
d.iget_value(0, 1)  # iget_value(irow, icol)
#%% filtering with conditions
d[d > 5]
#%%
d[d.a > 5]
#%%
d[(d > 5) & (d % 3 == 0)]
#%% what filtering with conditions actually does
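# NOTE: get_value/iget_value used above were later removed; .at/.iat are
# the scalar-access replacements (pandas >= 1.0). A minimal sketch on the
# same kind of frame:
import pandas as pd

d = pd.DataFrame({'a': range(0, 10), 'b': range(10, 20), 'c': range(20, 30)})
print(d.at[0, 'a'])  # label-based scalar access, cf. d.get_value(row, col)
print(d.iat[0, 1])   # position-based scalar access, cf. d.iget_value(0, 1)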
def main():
    from pandas import DataFrame
    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import REPO_PATH, BUILD, DB_PATH, PREPARE, dependencies, benchmarks

    # GitRepo wants exactly 7 character hash?
    args.base_commit = args.base_commit[:7]
    if args.target_commit:
        args.target_commit = args.target_commit[:7]

    if not args.log_file:
        args.log_file = os.path.abspath(
            os.path.join(REPO_PATH, 'vb_suite.log'))

    random.seed(args.seed)
    np.random.seed(args.seed)

    TMP_DIR = tempfile.mkdtemp()
    prprint("TMP_DIR = %s" % TMP_DIR)
    prprint("LOG_FILE = %s\n" % args.log_file)

    benchmarks = [x for x in benchmarks if re.search(args.regex, x.name)]

    try:
        logfile = open(args.log_file, 'w')

        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod',
            start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint("Slaughtering kittens...")
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None, REPO_PATH,
                                                            args.base_commit)

        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                                          repo.messages.get(h_baseline, "")))

        prprint("removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame(dict(t_head=head_res['timing'],
                                t_baseline=baseline_res['timing'],
                                ratio=ratio,
                                name=baseline_res.name),
                           columns=["t_head", "t_baseline", "ratio", "name"])
        totals = totals.ix[totals.t_head > args.min_duration]  # ignore below threshold
        totals = totals.dropna().sort("ratio").set_index('name')  # sort in ascending order

        hdr = ftr = """
-----------------------------------------------------------------------
Test name                      | target[ms] |  base[ms]  |   ratio    |
-----------------------------------------------------------------------
""".strip() + "\n"

        s = "\n"
        s += hdr
        for i in range(len(totals)):
            t, b, r = totals.irow(i).values
            s += "{0:30s} {1: 12.4f} {2: 12.4f} {3: 12.4f}\n".format(totals.index[i], t, b, r)
        s += ftr + "\n"
        s += "Ratio < 1.0 means the target commit is faster than the baseline.\n"
        s += "Seed used: %d\n\n" % args.seed
        s += 'Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, ""))
        s += 'Base   [%s] : %s\n\n' % (h_baseline,
                                       repo.messages.get(h_baseline, ""))

        logfile.write(s)
        logfile.close()

        prprint(s)
        prprint("Results were also written to the logfile at '%s'\n"
                % args.log_file)
    finally:
        # print("Disposing of TMP_DIR: %s" % TMP_DIR)
        shutil.rmtree(TMP_DIR)
        logfile.close()
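# NOTE: the argparse setup that fills `args` is not shown in this excerpt.
# A hypothetical sketch consistent with the attributes the function reads
# (the flag names are guesses, not the actual vbench/test_perf CLI):
import argparse

parser = argparse.ArgumentParser(description='compare benchmarks between two commits')
parser.add_argument('--base-commit', dest='base_commit', required=True)
parser.add_argument('--target-commit', dest='target_commit', default=None)
parser.add_argument('--regex', default='.*', help='run only benchmarks matching this pattern')
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--log-file', dest='log_file', default=None)
parser.add_argument('--min-duration', dest='min_duration', type=float, default=0.01,
                    help='ignore benchmarks faster than this [ms]')
args = parser.parse_args()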
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
print(ser2)
print('\n')
print(ser2[-1])
print('\n')
print(ser.ix[:1])
print('\n')

ser3 = Series(range(3), index=[-5, 1, 3])
print(ser3.iget_value(2))
print('\n')

###############################################################
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
print(frame.irow(0))