def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
    # data - smoothed data to which seasonality will be added
    # if bound_type == Bound.UPPER -> segment is added
    # if bound_type == Bound.LOWER -> segment is subtracted
    len_smoothed_data = len(data)
    for idx, _ in enumerate(data):
        if idx - offset < 0:
            # TODO: add seasonality for non-empty parts
            continue
        if (idx - offset) % seasonality == 0:
            if bound_type == Bound.UPPER:
                upper_segment_bound = self.get_segment_bound(segment, Bound.UPPER)
                data = data.add(pd.Series(upper_segment_bound.values, index=segment.index + idx), fill_value=0)
            elif bound_type == Bound.LOWER:
                lower_segment_bound = self.get_segment_bound(segment, Bound.LOWER)
                data = data.add(pd.Series(lower_segment_bound.values * -1, index=segment.index + idx), fill_value=0)
            else:
                raise ValueError(f'unknown bound type: {bound_type.value}')
    return data[:len_smoothed_data]
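# A minimal standalone sketch (toy data, not from the class above) of the core trick:
# Series.add with a shifted integer index tiles a short segment onto the data at each
# seasonal position, while fill_value=0 leaves untouched points intact.
import pandas as pd

data = pd.Series([1.0] * 12)           # smoothed data
segment = pd.Series([0.5, 1.0, 0.5])   # one seasonal bump
seasonality, offset = 4, 0
for idx in range(offset, len(data), seasonality):
    data = data.add(pd.Series(segment.values, index=segment.index + idx), fill_value=0)
print(data)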
def simulate(self, p=None, tmin=None, tmax=None, freq=None, dt=1, istress=None):
    self.update_stress(tmin=tmin, tmax=tmax, freq=freq)
    h = Series(data=0, index=self.stress[0].series.index, name=self.name)
    stresses = self.get_stress(istress=istress)
    distances = self.get_distances(istress=istress)
    for stress, r in zip(stresses, distances):
        npoints = stress.index.size
        p_with_r = np.concatenate([p, np.asarray([r])])
        b = self.get_block(p_with_r, dt, tmin, tmax)
        c = fftconvolve(stress, b, 'full')[:npoints]
        h = h.add(Series(c, index=stress.index, fastpath=True), fill_value=0.0)
    if istress is not None:
        if self.stress[istress].name is not None:
            h.name = self.stress[istress].name
        else:
            h.name = self.name + "_" + str(istress)
    else:
        h.name = self.name
    return h
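# A standalone sketch (invented data, not the class above) of the convolution step:
# a stress series is convolved with a block response, the result is truncated to the
# original length, and accumulated onto a shared index via Series.add(fill_value=0).
import numpy as np
import pandas as pd
from scipy.signal import fftconvolve

idx = pd.date_range("2000-01-01", periods=10, freq="D")
stress = pd.Series(np.ones(10), index=idx)
block = np.array([0.5, 0.3, 0.2])  # assumed impulse-response block
c = fftconvolve(stress, block, 'full')[:stress.size]
h = pd.Series(0.0, index=idx).add(pd.Series(c, index=idx), fill_value=0.0)
print(h)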
def rate_of_return(period_ret: pd.Series, base_period: str) -> pd.Series:
    """
    Convert returns across periods.

    Suppose the return columns of factor_data are named period_30D, period_150D,
    and period_450D. Taking period_30D as the base period, if the period_150D
    return is r and returns are stable, then in theory the period_30D return
    implied by period_150D is (1+r)^{30/150} - 1.

    Parameters
    ----------
    :param period_ret: data containing forward returns; its name should include the period
    :param base_period: base period used for the conversion, e.g. ('1 days', '1D', '30m', '3h', '1D1h', etc.)
    """
    period_len = get_period(period_ret.name.replace("period_", ""))
    base_period = get_period(base_period.replace("period_", ""))
    pattern = re.compile(r"\d+")
    interval = pattern.findall(period_len)[0]
    base_interval = pattern.findall(base_period)[0]
    # month/quarter/year units are converted to Timedeltas by hand;
    # "min" and "d" are understood by pd.Timedelta directly
    if (period_len.replace(interval, "") != "min") and (period_len.replace(interval, "") != "d"):
        if period_len.replace(interval, "") == "m":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_MONTH)
            base_period = int(base_interval) * pd.Timedelta(days=DAYS_PER_MONTH)
        elif period_len.replace(interval, "") == "q":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_QUARTER)
            base_period = int(base_interval) * pd.Timedelta(days=DAYS_PER_QUARTER)
        elif period_len.replace(interval, "") == "y":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_YEAR)
            base_period = int(base_interval) * pd.Timedelta(days=DAYS_PER_YEAR)
    conversion_factor = pd.Timedelta(base_period) / pd.Timedelta(period_len)
    return period_ret.add(1).pow(conversion_factor).sub(1.0)
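# A quick worked check of the conversion formula above (assumed numbers): a 15%
# return over 150 days implies (1 + 0.15) ** (30 / 150) - 1, roughly 2.8% over 30 days.
import pandas as pd

r = pd.Series([0.15], name="period_150D")
converted = r.add(1).pow(30 / 150).sub(1.0)
print(converted)  # ~0.0284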
def test_fill_value_when_combine_const(self):
    # GH12723
    s = Series([0, 1, np.nan, 3, 4, 5])
    exp = s.fillna(0).add(2)
    res = s.add(2, fill_value=0)
    assert_series_equal(res, exp)
def chunkRead():
    reader = pd.read_csv('data6.csv', sep=',', chunksize=1000)
    # print(reader.get_chunk(5)['key'].value_counts())
    series = Series([])
    for chunk in reader:
        series = series.add(chunk['key'].value_counts(), fill_value=0)
    print(series.sort_values(ascending=False)[:10])
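# Why fill_value=0 matters when merging per-chunk counts (standalone toy example):
# without it, a key absent from one chunk aligns to NaN and poisons the total.
import pandas as pd

counts_a = pd.Series({'x': 2, 'y': 1})
counts_b = pd.Series({'y': 3, 'z': 4})
print(counts_a.add(counts_b))                # x and z become NaN
print(counts_a.add(counts_b, fill_value=0))  # x 2.0, y 4.0, z 4.0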
def series_simple_math(ser: pd.Series, function: str, number: int) -> pd.core.series.Series:
    """Write some simple math helper functions for series.

    Take the given series, perform the required operation and return the
    new series.

    For example, given the series:
        0    0
        1    1
        2    2
        dtype: int64
    function 'add' and 'number' 2, you should return:
        0    2
        1    3
        2    4
        dtype: int64

    :param ser: Series to perform operation on
    :param function: The operation to perform
    :param number: The number to apply the operation to
    """
    if function == "add":
        return ser.add(number)
    elif function == "sub":
        return ser.sub(number)
    elif function == "mul":
        return ser.mul(number)
    elif function == "div":
        return ser.div(number)
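# Example calls exercising the helper above (toy inputs, not from the original source):
import pandas as pd

ser = pd.Series([0, 1, 2])
print(series_simple_math(ser, "add", 2))  # 2, 3, 4
print(series_simple_math(ser, "mul", 3))  # 0, 3, 6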
def test_flex_add_scalar_fill_value(self):
    # GH12723
    ser = Series([0, 1, np.nan, 3, 4, 5])
    exp = ser.fillna(0).add(2)
    res = ser.add(2, fill_value=0)
    tm.assert_series_equal(res, exp)
def get_sum(docs, df, length):
    sum = Series([])
    if length > 0:
        for doc in docs:
            sum = sum.add(df.loc[doc + ".xml"], fill_value=0)
        return sum
    else:
        return [0] * length  # length <= 0, so this is always an empty list
def _holding_ret(self, ret: pd.Series) -> pd.Series:
    """
    Compute stock returns for holding periods of different lengths.

    :param ret: series of stock returns
    :return: holding-period return
    """
    # Holding period return: compound (1 + r) over the next `hp` periods
    ret = ret.add(1)
    ret_label = 1
    for shift_ in range(self.hp):
        ret_label *= ret.groupby(KN.STOCK_ID.value).shift(-shift_)
    ret_label = ret_label.sub(1)
    return ret_label
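# Standalone sketch of the compounding idea (toy numbers, no groupby): a 2-period
# holding return is (1 + r_t)(1 + r_{t+1}) - 1, built from shifted (1 + r) factors.
import pandas as pd

ret = pd.Series([0.10, 0.20, -0.05])
gross = ret.add(1)
hp2 = (gross * gross.shift(-1)).sub(1)
print(hp2)  # 0.32, 0.14, NaN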
def analysize(self):
    """
    For each factor, analyze the average total spending of each group.
    """
    attributeSeries = Series([])
    attributeDict = dict()
    for piece in self.chunks:
        attributeSeries = attributeSeries.add(piece.groupby(self.attribute).amount.sum(), fill_value=0.0)
        # Note how the statements below are used: after grouping (e.g. by ethnicity)
        # the student IDs are not necessarily unique, so nunique would be needed to
        # return the number of unique IDs.
        # attributeCount = attributeCount.add(piece.groupby(self.attribute).studentID.nunique(), fill_value=0.0)
        for attribute, idArray in piece.groupby(self.attribute).studentID.unique().iteritems():
            attributeDict.setdefault(attribute, np.array([]))
            attributeDict[attribute] = np.union1d(attributeDict[attribute], idArray)
    return (attributeSeries / (Series(attributeDict).apply(lambda x: len(x)))).sort_values()
def _get_view_target_weights(self, view: View, market_weights: pd.Series,
                             market_covariance: pd.DataFrame, view_matrix: pd.DataFrame,
                             view_out_performance: pd.Series) -> pd.Series:
    """Get target weights based on the view allocation and stated confidence in the view."""
    zero_view_cov = pd.DataFrame([0], index=[view.id], columns=[view.id])
    full_confidence_weights = self._get_weights(market_weights, market_covariance,
                                                view_matrix, zero_view_cov,
                                                view_out_performance)
    max_weight_difference = full_confidence_weights - market_weights
    target_weights = market_weights.add(view.confidence * max_weight_difference)
    return target_weights
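# Standalone sketch (toy numbers) of the interpolation performed above: target weights
# sit a fraction `confidence` of the way from the market weights toward the
# full-confidence weights, i.e. w_target = w_mkt + c * (w_full - w_mkt).
import pandas as pd

market_weights = pd.Series({'A': 0.6, 'B': 0.4})
full_confidence_weights = pd.Series({'A': 0.7, 'B': 0.3})
confidence = 0.5
target = market_weights.add(confidence * (full_confidence_weights - market_weights))
print(target)  # A 0.65, B 0.35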
def predictions_better(ts_data, window_size, should_plot=True):
    results_ARIMA = model_combined_no_log(ts_data, window_size, True)

    # Make a series with cumulative fitted values
    predictions_ARIMA_diff = Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()

    # Make a series with combined original and cumulative fitted values
    predictions_ARIMA = Series(ts_data.ix[0], index=ts_data.index)
    # predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff, fill_value=0)
    predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff_cumsum, fill_value=0)

    if should_plot:
        pyplot.figure(2)
        pyplot.plot(ts_data, color="blue", label="Original")
        pyplot.plot(predictions_ARIMA, color="green", label="Prediction")
        pyplot.legend(loc="best")
        pyplot.title("Predictions")
        pyplot.show(block=False)
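# Standalone sketch (toy data) of the reconstruction used above: a differenced series
# is undone by seeding every point with the first observed level and adding the
# cumulative sum of the differences.
import pandas as pd

ts = pd.Series([10.0, 12.0, 11.0, 15.0])
diffs = ts.diff().dropna()
rebuilt = pd.Series(ts.iloc[0], index=ts.index).add(diffs.cumsum(), fill_value=0)
print(rebuilt)  # 10.0, 12.0, 11.0, 15.0, reproducing ts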
def test_timedelta_arithmetic(self):
    data = Series(["nat", "32 days"], dtype="timedelta64[ns]")
    deltas = [timedelta(days=1), Timedelta(1, unit="D")]
    for delta in deltas:
        result_method = data.add(delta)
        result_operator = data + delta
        expected = Series(["nat", "33 days"], dtype="timedelta64[ns]")
        tm.assert_series_equal(result_operator, expected)
        tm.assert_series_equal(result_method, expected)

        result_method = data.sub(delta)
        result_operator = data - delta
        expected = Series(["nat", "31 days"], dtype="timedelta64[ns]")
        tm.assert_series_equal(result_operator, expected)
        tm.assert_series_equal(result_method, expected)

        # GH 9396
        result_method = data.div(delta)
        result_operator = data / delta
        expected = Series([np.nan, 32.0], dtype="float64")
        tm.assert_series_equal(result_operator, expected)
        tm.assert_series_equal(result_method, expected)
def main(tweets_filepath, count_filepath, median_filepath):
    '''Function to read the file and construct word counter'''
    f_read = open(tweets_filepath, 'r')

    wordcount_series = Series()
    stream_of_median = []
    first_half_max_heap = []
    second_half_min_heap = []
    first_element_median_flag = False

    for each in f_read:
        word_counter = Counter(each.lower().rstrip().split(' '))
        word_series = Series(word_counter)
        wordcount_series = wordcount_series.add(word_series, fill_value=0)

        if first_element_median_flag:
            curr_median = median_unique.running_median(first_half_max_heap,
                                                       second_half_min_heap,
                                                       float(len(word_counter.keys())))
            stream_of_median.append(curr_median)
        else:
            first_element_median_flag = True
            curr_median = len(word_counter.keys())
            second_half_min_heap.append(curr_median)
            stream_of_median.append(curr_median)

    f_read.close()
    write_count_file(wordcount_series, count_filepath)
    write_median_file(stream_of_median, median_filepath)
x = DataFrame(numpy.arange(9.).reshape((3, 3)),
              columns = ['A', 'B', 'C'],
              index = ['a', 'b', 'c'])
y = DataFrame(numpy.arange(12).reshape((4, 3)),
              columns = ['A', 'B', 'C'],
              index = ['a', 'b', 'c', 'd'])
print x
print y
print x + y
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   NaN   NaN   NaN
'''
print 'fill the non-overlapping parts of x/y, not the NaNs in the result'
print x.add(y, fill_value = 0)  # x is unchanged
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   9.0  10.0  11.0
'''
print 'DataFrame vs Series arithmetic: row-wise'
frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                  columns = ['A', 'B', 'C'],
                  index = ['a', 'b', 'c'])
series = frame.ix[0]
print frame
df = pd.DataFrame(data, index=index, columns=columns)  # build a DataFrame
df.ix[:, 0:2]
df.ix[:, [0, 2]]
iris.query('Species == "setosa"')
dt.query('sl >5 & pw >2')
print(dt.query('sl >5'))

############################ pandas
from pandas import Series, DataFrame
import pandas as pd

arr = [1, 2, 3, 4]
series_1 = Series(arr)
series_2 = Series([1, 2, 3, 4])
series_3 = Series([1, 2, '3', 4, 'a'])
series_4 = Series([1, 2, 3])
series_4.index = ['a', 'b', 'c']  # create an index

temp = Series([5])
type(temp)
series_4.append(temp)  # insert; Series.add() is arithmetic addition, not for appending elements
series_4.add(temp)     # adds values at matching index positions
series_4.drop('a')     # delete
series_4['a'] = 4      # update
series_4['a']          # read
def practice_two():
    # reindexing
    obj = Series(['b', 'p', 'y'], index=[0, 2, 4])
    obj.reindex(range(6), method='ffill')
    '''
    ffill     fill values forward
    bfill     fill values backward
    pad       carry values forward
    backfill  carry values backward
    '''
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])  # 3x3 array with row index `index` and column index `columns`
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # add a row with index b
    states = ['Texas', 'Utah', 'California']
    frame.reindex(columns=states)  # columns can be reindexed with the columns keyword
    '''
    reindex parameters
    index       new sequence to use as the index
    method      interpolation (fill) method
    fill_value  substitute value to use when missing values are introduced by reindexing
    limit       maximum fill count when forward or backward filling
    level       match a simple index on the given level of a MultiIndex, otherwise take a subset
    copy        defaults to True: always copy; if False, do not copy when the new and old indexes are equal
    '''

    # dropping entries from an axis
    obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    obj.drop('c')  # drop row c
    obj.drop(['d', 'c'])  # drop rows d and c
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['o', 'c', 'u', 'n'],
                     columns=['one', 'two', 'three', 'four'])
    data.drop(['two', 'four'], axis=1)  # drop the columns two and four

    # indexing, selection, and filtering
    obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
    obj['b']  # equivalent to obj[1]
    obj[2:4]
    obj[['b', 'a', 'd']]
    obj[[1, 3]]
    obj[obj < 2]
    obj['b':'c']
    obj['b':'c'] = 5  # setting values
    '''
    DataFrame indexing options
    obj[val]                      select a single column or a group of columns
    obj.ix[val]                   a single row or a group of rows
    obj.ix[val1, val2]            select rows and columns at the same time
    reindex method                conform one or more axes to new indexes
    xs method                     select a single row or column by label, returning a Series
    icol, irow methods            select a single column or row by integer position, returning a Series
    get_value, set_value methods  select a single value by row and column label
    '''

    # arithmetic and data alignment
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
    s1 + s2  # NA values are introduced where the indexes do not overlap
    # the same happens with DataFrames
    s1.add(s2, fill_value=0)  # no NA values, plain addition
    # for DataFrames, reindex(columns=..., fill_value=0) fills with a specified value
    '''
    add  +
    sub  -
    div  /
    mul  *
    '''
    frame = DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    series = frame.ix[0]
    frame - series
    series2 = Series(range(3), index=['b', 'e', 'f'])
    frame + series2  # NA values appear
    series3 = frame['d']
    frame.sub(series3, axis=0)

    # function application and mapping
    frame = DataFrame(np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    np.abs(frame)  # absolute values
    f = lambda x: x.max() - x.min()
    frame.apply(f)
    frame.apply(f, axis=1)
    format = lambda x: '%.2f' % x
    frame.applymap(format)
    frame['e'].map(format)

    # sorting and ranking
    '''
    .sort_index()                 sort lexicographically by row index
    .sort_index(axis=1)           by column index
    .sort_index(ascending=False)  descending; default is ascending
    .order()                      for a Series
    .sort_index(by='*')           by column *
    .rank(ascending=False, method='first', axis=1)
        # 'average'  default, mean rank
        # 'min'      minimum rank
        # 'max'      maximum rank
        # 'first'    ranks assigned in the order the values appear in the data
    '''

    # axis indexes with duplicate values
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    obj.index.is_unique  # whether the values are unique
    pass
'''
data = pd.read_csv('data/ex4.csv', nrows=2, skiprows=[0, 2])
print(data)
#   value1 value2  key1  key2
# 0    one      a     0     1
# 1    one      b     2     3

# split the data into chunks
chunker = pd.read_csv('data/ex3.csv', chunksize=2)
print(chunker)  # <pandas.io.parsers.TextFileReader object at 0x000000000B1B9F60>
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['value1'].value_counts(), fill_value=0)
print(tot)
# one    5.0
# two    3.0
# dtype: float64
print(tot[0])
'''
writing out in text format with a custom delimiter
'''
data = pd.read_csv('data/ex3.csv')
print(data)
import sys
data.to_csv('data/ex6.csv', sep='|')
            tf.ix[fila, word] = 1
        else:
            tf.ix[fila, word] = tf.ix[fila, word] + 1
    tf.ix[fila] = tf.ix[fila] / len(tokens)
    fila = fila + 1
    print "Row: ", fila
# print tf
print "TF MATRIX READY"

idf = Series()
# print idf.index
for term in termslist.keys():
    apariciones = termslist[term]
    totaldoc = data.shape[0]
    argumento = totaldoc / (1 + apariciones)
    # print argumento
    test = Series({term: math.log(argumento)})
    idf = idf.add(test, fill_value=0)
    # print idf
print "IDF READY"
gc.collect()

for i, row in tf.iterrows():
    print i
    tf.ix[i] = row.multiply(idf)
    # gc.collect()

# print tf
# print idf
# tfidf = tf.apply(lambda x: x.multiply(idf), axis=1)
# print tfidf
tf.to_csv('tfidf.csv')
idf.to_csv('idf.csv')
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)
    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a', 'b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0, 2, 3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12'])  # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)
    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)
    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h, v in zip(header, zip(*values))}
    p(data_dict)
    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = ''  # 'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])
        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content = """
<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
  <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.00</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root:  # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
n_file = len(idxs_file)
n_process = 12
arg_lists = list(zip(idxs_file, [path] * n_file, [True] * n_file))

# Parallel run
start_time = time.time()
with Pool(processes=n_process) as pool:
    results = pool.map(analyze_file, arg_lists)

# Reduce the results
count_fare = Series()
mat_reg1_XX_XY = np.zeros((2, 3))
mat_reg2_XX_XY = np.zeros((3, 4))
for result in results:
    count_fare = count_fare.add(result[0], fill_value=0)
    mat_reg1_XX_XY += result[1]
    mat_reg2_XX_XY += result[2]

# Compute the deciles
cdf = np.cumsum(count_fare) / np.sum(count_fare)
deciles = [cdf[cdf >= p].index[0] for p in np.arange(0, 1.05, 0.1)]

# Solve the regressions
coeff1 = np.linalg.solve(mat_reg1_XX_XY[:, 0:2], mat_reg1_XX_XY[:, 2])
coeff2 = np.linalg.solve(mat_reg2_XX_XY[:, 0:3], mat_reg2_XX_XY[:, 3])

print("Deciles of the total amount less toll:")
print(deciles)
print("Linear model of the total amount less the tolls versus trip time:")
print(coeff1)
student

■ Series-to-Series arithmetic
obj1 = Series([10, 5, 3, 7], index=['a', 'b', 'c', 'd'])
obj2 = Series([2, 4, 6, 8, 10], index=['a', 'b', 'c', 'd', 'e'])

# Series scalar arithmetic
obj1 * 100

# addition
# Series-to-Series operations are aligned on index labels
obj1 + obj2
# when operating on two Series, treat labels missing from one side as 0
obj1.add(obj2, fill_value=0)

# subtraction
obj1 - obj2
obj1.sub(obj2, fill_value=0)

# multiplication
obj1 * obj2
obj1.mul(obj2, fill_value=1)

# division
obj1 / obj2
obj1.div(obj2, fill_value=1)

■ DataFrame arithmetic: aligned on the index
df1 = DataFrame(np.arange(6).reshape(2, 3),
print x
print y
print x + y
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   NaN   NaN   NaN
'''
print 'fill the non-overlapping parts of x/y, not the NaNs in the result'
print x.add(y, fill_value=0)  # x is unchanged
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   9.0  10.0  11.0
'''
print 'DataFrame vs Series arithmetic: row-wise'
frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                  columns=['A', 'B', 'C'],
                  index=['a', 'b', 'c'])
series = frame.ix[0]
# use dict to create Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4
pd.isnull(obj4)
pd.notnull(obj4)

# auto align
obj3 + obj4
obj3.add(obj4, fill_value=0)

# DataFrame =====================================================================
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

# can set column and index
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])

# selecting by columns, returns a Series, so you can index the Series again
frame['state']
frame[['year', 'state']]
frame[[0]]
frame[[0, 2, 1]]

# selecting by rows or index
# u'New Lilianland' u'Iowa' 76517L 91000L 120000L 35000L]]
print("\n")
print(matrix[:, -3:])
print("\n")
print(matrix[:, -3:].sum(axis=1))

print("\n=======================================================")
print("Series_operation")
print("=======================================================")
s1 = Series(range(1, 6), index=list("abced"))
print(s1)
print("\n")
s2 = Series(range(5, 11), index=list("bcedef"))
print(s2)
print("\n")
print(s1.add(s2))
print("\n")
print(s1 + s2)  # same result as above
# the operation is performed by index label
# labels with no overlap come back as NaN

print("\n=======================================================")
print("Dataframe_operation")
print("=======================================================")
df1 = DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
print(df1)
print("\n")
df2 = DataFrame(np.arange(16).reshape(4, 4), columns=list("abcd"))
print(df2)
print("\n")
print(df1 + df2)
s1 = s + 10  # adds 10 to every element of the Series
print(s1)

s2 = Series(data=np.random.randint(0, 100, size=5))
print(s2)
print(s)
print(s + s2)  # added element-wise by index; unmatched indexes simply return NaN!

# 4.2 arithmetic when indexes do not match
s1 = Series(np.random.randint(0, 150, size=4), index=["A", "B", "C", "Sara"], name="math")
s2 = Series(data=np.random.randint(0, 150, size=5), index=["张三", "李四", "Sara", "Lisa", "Machel"])
print(s1 + s2)
s1.add(s2)
s1.add(s2, fill_value=0)  # automatically fill NaN with 0 before adding

# 4.3 other add/subtract/multiply/divide methods
s.add(20)
s.subtract(20)
s.multiply(2)
s.divide(2)

# 4.4 to keep every index, use the .add() function
np.full((2, 5), fill_value=10)
s.add(s2, fill_value=0)  # the last 5 entries are filled with 0 automatically instead of NaN

# -*- coding: utf-8 -*-
"""
5. Fourier example
# print b
# print c
# print d
# print e
# print g
# print h
# print i

# number of rows
len(f.index)

# arithmetic; DataFrame works the same way
s1 = Series(np.arange(10, 20), index=np.arange(0, 10))
s2 = Series(np.arange(50, 60), index=np.arange(5, 15))
s3 = s1 + s2
# fill in values that one side does not have
s4 = s1.add(s2, fill_value=0)
# print s3
# print s4

# arithmetic between a Series and a DataFrame
s1 = f.ix[0, :]
# the DataFrame subtracts the matching Series value from every row, aligned on index
f1 = f - s1
# print f1
# to operate column-wise instead, specify the axis
s1 = f.ix[:, 0]
f1 = f.add(s1, axis=0)
print f1
def get_monthly_return1(date_index):
    this_month = d4[d4.date == dates.loc[date_index]]
    next_month = d4[d4.date == dates.loc[date_index + 1]]
    temp = pd.merge(next_month, this_month, how='inner', left_on='PERMNO',
                    right_on='PERMNO', suffixes=('_n', '_t'))
    tickers = get_tickers1(temp)
    return get_value_weighted_return(temp, tickers)

dates = Series(d4.date.unique())
small_mon = Series(dates.index[:-1]).map(get_monthly_return1)
small = np.cumprod(small_mon.add(1))

"""## 2. Top 35% B/M ratio"""

def get_tickers2(df):
    number = int(round(len(df.permno.drop_duplicates()) * 0.35))
    bm = df.bm.sort_values(ascending=False)[:number]
    tickers = df.permno.loc[bm.index]
    return tickers

def get_monthly_return2(date_index):
    this_month = d4[d4.date == dates.loc[date_index]]
    next_month = d4[d4.date == dates.loc[date_index + 1]]
    bm = d1[d1.public_date == dates.loc[date_index]]
    temp = pd.merge(next_month,
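# Standalone sketch (toy returns) of the cumulative-growth step above: compounding
# monthly returns into an index level via np.cumprod(returns + 1).
import numpy as np
import pandas as pd

monthly = pd.Series([0.02, -0.01, 0.03])
growth = np.cumprod(monthly.add(1))
print(growth)  # 1.02, 1.0098, 1.040094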
#!/usr/bin/env python
# encoding=utf-8
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Reading files in pieces
# When processing very large files, or working out the right parameter set before
# processing a large file, you can read a small piece or iterate over it in chunks.
result = pd.read_csv('ex6.csv')

# you can read only some of the rows; nrows reads the first few rows
result_part = pd.read_csv('ex6.csv', nrows=5)
print result_part

# to read the file in pieces, set chunksize (number of rows)
chunker = pd.read_csv('ex6.csv', chunksize=1000)
# chunker is a TextFileReader
print chunker

tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# descending; `order` has been renamed to sort_values
tot = tot.sort_values(ascending=False)
print tot
print tot[:10]
# print chunker.get_chunk(500)
# arithmetic and data alignment
x = DataFrame(n.arange(9.).reshape((3, 3)),
              index=['a', 'b', 'c'],
              columns=['A', 'B', 'C'])
y = DataFrame(n.arange(12).reshape((4, 3)),
              index=['a', 'b', 'c', 'd'],
              columns=['A', 'B', 'C'])
x + y
# result: non-overlapping cells are NaN, overlapping cells are added element-wise
#       A     B     C
# a   0.0   2.0   4.0
# b   6.0   8.0  10.0
# c  12.0  14.0  16.0
# d   NaN   NaN   NaN

x.add(y, fill_value=0)
# result: fills the non-overlapping parts of x/y, not the NaNs in the result
#       A     B     C
# a   0.0   2.0   4.0
# b   6.0   8.0  10.0
# c  12.0  14.0  16.0
# d   9.0  10.0  11.0

frame = DataFrame(n.arange(9).reshape((3, 3)),
                  index=['a', 'b', 'c'],
                  columns=['A', 'B', 'C'])
series = frame.ix[0]
frame - series
# result: broadcasts down the rows by default
#    A  B  C
# 4. arithmetic and data alignment
if __name__ == '__main__':
    print('DataFrame arithmetic: non-overlapping cells are NaN, overlapping cells are added element-wise:')
    x = DataFrame(numpy.arange(9.).reshape((3, 3)),
                  columns=['A', 'B', 'C'],
                  index=['a', 'b', 'c'])
    y = DataFrame(numpy.arange(12).reshape((4, 3)),
                  columns=['A', 'B', 'C'],
                  index=['a', 'b', 'c', 'd'])
    print(x)
    print(y)
    print(x + y)
    print('fill the non-overlapping parts of x/y, not the NaNs in the result:')
    print(x.add(y, fill_value=0))  # x is unchanged
    print('DataFrame vs Series arithmetic: applied to every row/column:')
    frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                      columns=['A', 'B', 'C'],
                      index=['a', 'b', 'c'])
    series = frame.ix[0]  # first row of frame
    print(frame)
    print(series)
    print(frame - series)  # subtract series from every row of frame
    series2 = Series(range(4), index=['A', 'B', 'C', 'D'])
    print(frame + series2)  # row-wise; a missing column becomes NaN
    series3 = frame.A  # first column of frame
    print(frame.sub(series3, axis=0))  # column-wise operation
class MySeries:
    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index

    def rolling_mean(self, *args, **kwargs):
        return MySeries(pd.rolling_mean(self.x, *args, **kwargs))
    def rolling_count(self, *args, **kwargs):
        return MySeries(pd.rolling_count(self.x, *args, **kwargs))
    def rolling_sum(self, *args, **kwargs):
        return MySeries(pd.rolling_sum(self.x, *args, **kwargs))
    def rolling_median(self, *args, **kwargs):
        return MySeries(pd.rolling_median(self.x, *args, **kwargs))
    def rolling_min(self, *args, **kwargs):
        return MySeries(pd.rolling_min(self.x, *args, **kwargs))
    def rolling_max(self, *args, **kwargs):
        return MySeries(pd.rolling_max(self.x, *args, **kwargs))
    def rolling_std(self, *args, **kwargs):
        return MySeries(pd.rolling_std(self.x, *args, **kwargs))
    def rolling_var(self, *args, **kwargs):
        return MySeries(pd.rolling_var(self.x, *args, **kwargs))
    def rolling_skew(self, *args, **kwargs):
        return MySeries(pd.rolling_skew(self.x, *args, **kwargs))
    def rolling_kurtosis(self, *args, **kwargs):
        return MySeries(pd.rolling_kurtosis(self.x, *args, **kwargs))
    def rolling_window(self, *args, **kwargs):
        return MySeries(pd.rolling_window(self.x, *args, **kwargs))
    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))
    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))
    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))
    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))
    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))
    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))
    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))
    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))
    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))
    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))
    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))
    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))
    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))
    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))
    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))
    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))
    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))
    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))
    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))
    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))
    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))
    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))
    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))
    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))
    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
# To read in pieces, set chunksize (number of rows).
# The get_chunk method also lets you read pieces of arbitrary size.
chunker = pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv', chunksize=1000)
chunker

# In[25]:
# this object can be iterated over
tot = Series([])

# In[26]:
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot

# In[27]:
# writing data out in text format
data = pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv')
data

# In[46]:
# DataFrame's to_csv method writes the data to a comma-separated file
data.at[0, 'something'] = 'yuki'
data.to_csv('pydata-book-2nd-edition/examples/ex6_out.csv')

# In[29]:
print(s1)
print(s2)
# a   -2.1
# c    3.6
# e   -1.5
# f    4.0
# g    3.1
# dtype: float64

# adding the two: where both sides have a value, the values are added;
# where either side is missing, the result is NaN
# a    5.2
# c    1.1
# d    NaN
# e    0.0
# f    NaN
# g    NaN
# dtype: float64
print(s1 + s2)

# to fill the non-overlapping positions, use the fill_value parameter of add:
# the missing side is treated as fill_value before the operation
# a    5.2
# c    1.1
# d    3.4
# e    0.0
# f    4.0
# g    3.1
# dtype: float64
print(s1.add(s2, fill_value=0))
print(df.loc[:'2월', ['서초']])
print(df.loc[:'2월', ['서초', '강남']])

print('\niloc')
print(df.iloc[2])
print(df.iloc[2, :])     # every column of row 2
print(df.iloc[:3, 2])
print(df.iloc[:3, 1:3])  # columns 1 up to (but not including) 3

print('\n\nArithmetic ------------')
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
print(s1)
print(s2)
print(s1 + s2)
print(s1.add(s2))  # operates where index labels match (the indexes must line up)
print()

df1 = DataFrame(np.arange(9.).reshape(3, 3), columns=list('kbs'),
                index=['서울', '인천', '수원'])
print(df1)
df2 = DataFrame(np.arange(12.).reshape(4, 3), columns=list('kbs'),
                index=['서울', '인천', '일산', '수원'])
print(df2)
print()
print(df1 + df2)         # the operator form cannot take options
print(df1.add(df2))      # the method form can take options
print(df1.add(df2, fill_value=0))  # e.g. the fill_value option
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels)

### reading text files in pieces
result = pd.read_csv('ex6.csv')
result
pd.read_csv('ex6.csv', nrows=5)

chunker = pd.read_csv('ex6.csv', chunksize=10)
chunker
tot = Series([])
for chunk in chunker:
    tot = tot.add(chunk['key'].value_counts(), fill_value=0)
tot

### Writing out data -- just like read in no examples

### stop for today

### JSON
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
def practice_one():
    obj = Series([4, 7, -5, 3])
    '''
    pandas parsing functions
    read_csv        load delimited data from a file, URL, or file-like object; default delimiter is a comma
    read_table      load delimited data from a file, URL, or file-like object; default delimiter is a tab
    read_fwf        read fixed-width-column data (no delimiters)
    read_clipboard  read data from the clipboard; a clipboard version of read_table
    '''
    '''
    read_csv / read_table parameters:
    path           string giving a filesystem location, URL, or file-like object
    sep, delimiter character sequence or regular expression used to split the fields of each row
    header         row number to use as the column names; defaults to 0 (the first row), set to None if there is none
    index_col      column number(s) or name(s) to use as the row index
    names          list of column names for the result
    skiprows       number of rows to skip (from the start of the file), or a list of row numbers to skip (starting at 0)
    na_values      set of values to replace with NA
    comment        character(s) used to split comments off the end of lines
    parse_dates    parse data as dates; defaults to False. If True, tries to parse all columns;
                   can also specify a set of column numbers or names to parse
    keep_date_col  if columns are joined to parse a date, keep the joined columns; defaults to False
    converters     dict mapping column names to conversion functions
    dayfirst       when parsing ambiguous dates, treat them as international format
    date_parser    function used to parse dates
    nrows          number of rows to read
    iterator       return a TextParser for reading the file piece by piece
    chunksize      size of the file chunks (for iteration)
    skip_footer    number of rows to ignore at the end of the file
    verbose        print parser output information
    encoding       text encoding for unicode
    squeeze        if the parsed data contains only one column, return a Series
    thousands      thousands separator, e.g. ',' or '.'
    '''
    # reading text files in pieces
    '''
    directory: ch06
    file: ex6.csv
    '''
    # when you only want to read a small piece of a file, or iterate over it
    pd.read_csv('ch06/ex6.csv')
    # to read only a few rows, specify nrows
    pd.read_csv('ch06/ex6.csv', nrows=5)
    # to read the file in pieces, set chunksize (rows); returns a TextParser object
    chunker = pd.read_csv('ch06/ex6.csv', chunksize=10)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['message'].value_counts(), fill_value=0)  # aggregate on the message column
    tot = tot.order(ascending=False)

    # writing data out in text format
    data = pd.read_csv('ch06/ex5.csv')
    data.to_csv('ch06/out.csv')  # write the data to a comma-separated file
    data.to_csv(sys.stdout, sep='|')  # use | as the delimiter
    data.to_csv(sys.stdout, na_rep='NULL')  # represent missing values as NULL instead of empty strings
    data.to_csv(sys.stdout, index=False, header=False)
    data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])

    # working with delimited formats by hand
    import csv
    f = open('ch06/ex7.csv')
    reader = csv.reader(f)
    for line in reader:
        print(line)
    lines = list(csv.reader(open('ch06/ex7.csv')))
    header, values = lines[0], lines[1:]  # split
    data_dict = {h: v for h, v in zip(header, zip(*values))}

    # define a subclass of csv.Dialect to control the format
    class my_dialect(csv.Dialect):
        lineterminator = '\n'
        delimiter = ';'
        quotechar = '"'

    reader = csv.reader(f, dialect=my_dialect)
    reader = csv.reader(f, delimiter='|')  # pass options directly without defining a subclass
    '''
    csv.Dialect attributes
    delimiter         one-character string used to separate fields; defaults to ','
    lineterminator    line terminator for writing; defaults to '\r\n'
    quotechar         quote character for fields with special characters; defaults to '"'
    quoting           quoting convention. Options include csv.QUOTE_ALL (quote all fields),
                      csv.QUOTE_MINIMAL (only fields with special characters such as the delimiter),
                      csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting); defaults to QUOTE_MINIMAL
    skipinitialspace  ignore whitespace after the delimiter; defaults to False
    doublequote       how to handle the quote character inside a field; if True, it is doubled
    escapechar        string used to escape the delimiter; disabled by default
    '''
    with open('mydata.csv', 'w') as f:
        writer = csv.writer(f, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))

    # JSON data
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    import json
    result = json.loads(obj)  # convert a JSON object to Python form
    json.dumps(result)  # convert a Python object to JSON
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])  # convert a JSON object to a DataFrame

    # XML and HTML: web scraping
    from lxml.html import parse
    from urllib2 import urlopen  # the urllib2 module could not be downloaded
    parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
    doc = parsed.getroot()
    links = doc.findall('.//a')  # query
    links[28].get('href')  # get the URL
    links[28].text_content()  # get the text
    urls = [lnk.get('href') for lnk in doc.findall('.//a')]  # all URLs in the document
    tables = doc.findall('.//table')
    calls = tables[9]
    puts = tables[13]
    rows = calls.findall('.//tr')

    def _unpack(row, kind='td'):
        elts = row.findall('.//%s' % kind)
        return [val.text_content() for val in elts]
    _unpack(rows[1], kind='th')
    _unpack(rows[1], kind='td')

    from pandas.io.parsers import TextParser

    def parse_options_data(table):
        rows = table.findall('.//tr')
        header = _unpack(rows[0], kind='th')
        data = [_unpack(r) for r in rows[1:]]
        return TextParser(data, names=header).get_chunk()

    parse_options_data(calls)
    parse_options_data(puts)
    pass
def _series_add(previous_result: pd.Series, new_result: pd.Series):
    """Reducing function for adding up the results across chunks.

    Equivalent to ``lambda a, b: a + b`` except it takes advantage of
    ``fill_value`` in pd.Series.add.
    """
    return previous_result.add(new_result, fill_value=0)
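# One possible use of the reducer above (assumed chunk results, not from the source):
# folding per-chunk Series together with functools.reduce.
from functools import reduce
import pandas as pd

chunk_results = [pd.Series({'a': 1.0, 'b': 2.0}), pd.Series({'b': 3.0, 'c': 4.0})]
total = reduce(_series_add, chunk_results)
print(total)  # a 1.0, b 5.0, c 4.0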
# print tfidf.head(5)
# print idf.size

tfquery = Series()
linea = "Armed Robbery Suspect Arrested w/ Handgun"  # QUERY TO EVALUATE
linea = linea.upper()
tokens = linea.split()
for word in tokens:
    if word in tfidf.columns:
        print word
        if word in tfquery:
            tfquery[word] = tfquery[word] + 1
        else:
            tfidf[word] = 0
            test = Series({word: 1})
            tfquery = tfquery.add(test, fill_value=0)

tfquery = tfquery / len(tokens)
tfquery = tfquery.multiply(idf, fill_value=0)
print "TFIDF DONE"

vectorTFIDF = tfquery.as_matrix()
distancias = []
for i, f in tfidf.iterrows():
    # distancias.append(dist(vectorTFIDF, f.as_matrix()))
    distancias.append(1 - spatial.distance.cosine(f.as_matrix(), vectorTFIDF))

# b = numpy.argsort(distancias)
distancias = sorted(distancias, reverse=True)
print distancias[0:100]