def test_multigroup(self):
    left = pd.concat([self.left, self.left], ignore_index=True)
    # right = concat([self.right, self.right], ignore_index=True)
    left['group'] = ['a'] * 3 + ['b'] * 3
    # right['group'] = ['a'] * 4 + ['b'] * 4

    result = ordered_merge(left, self.right, on='key', left_by='group',
                           fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                          'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
                          'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
    expected['group'] = ['a'] * 6 + ['b'] * 6
    assert_frame_equal(result, expected.ix[:, result.columns])

    result2 = ordered_merge(self.right, left, on='key', right_by='group',
                            fill_method='ffill')
    assert_frame_equal(result, result2.ix[:, result.columns])

    result = ordered_merge(left, self.right, on='key', left_by='group')
    self.assertTrue(result['group'].notnull().all())
def _parse_cme_level2_data(fpath, max_level=5):
    """
    parse level 2 data

    columns are: (time, side, is_implied) as index
    + symbol, buy_depth, sell_depth,
    + for 1 <= i <= max_level and side in ['buy', 'sell']:
        (level_i_price_side, level_i_volume_side, level_i_orders_side)
    """
    index_names = ['time', 'side', 'is_implied']
    column_names = ['symbol', 'depth'] + map(lambda x: "level_{}".format(x),
                                             range(1, 11))
    data = pd.read_csv(fpath, parse_dates=[[0, 1]], date_parser=_convert_time,
                       index_col=[0, 2, 3])
    data.index.names = index_names
    data.columns = column_names
    if len(data) == 0:
        raise IOError("File is empty")
    # split each raw level cell on ' x ', ' (' and ')' into its
    # price / volume / orders components
    for i in xrange(1, max_level + 1):
        d = zip(*data["level_{}".format(i)]
                .apply(lambda s: re.split(r' x | \(|\)', s)[:3]).tolist())
        data['level_{}_price'.format(i)] = map(float, d[0])
        data['level_{}_volume'.format(i)] = map(int, d[1])
        data['level_{}_orders'.format(i)] = map(int, d[2])
        data.drop("level_{}".format(i), axis=1, inplace=True)
    data['symbol'] = data['symbol'].apply(lambda s: s.replace(' ', ''))
    data = data.reset_index()
    # keep only non-implied quotes, then split the book into its two sides
    data = data[data['is_implied'] == 0]
    buy_data = data[data['side'] == data['side'].values[0]].drop('side', axis=1)
    sell_data = data[data['side'] != data['side'].values[0]].drop('side', axis=1)
    data = pd.ordered_merge(buy_data, sell_data, on=['time', 'symbol'],
                            fill_method='ffill', suffixes=['_buy', '_sell'])
    data.set_index('time', inplace=True)
    return data
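# A minimal sketch of the per-level split above in isolation, assuming the
# raw level cells look like "<price> x <volume> (<orders>)" (the exact CME
# cell format is an assumption here, inferred from the regex):
import re

import pandas as pd

raw = pd.Series(["99.5 x 20 (3)", "99.0 x 15 (2)"])
parts = raw.apply(lambda s: re.split(r' x | \(|\)', s)[:3])
prices = [float(p[0]) for p in parts]   # [99.5, 99.0]
volumes = [int(p[1]) for p in parts]    # [20, 15]
orders = [int(p[2]) for p in parts]     # [3, 2]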
def test_ffill(self):
    result = ordered_merge(
        self.left, self.right, on='key', fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1., 1, 2, 2, 3, 3.],
                          'rvalue': [nan, 1, 2, 3, 3, 4]})
    assert_frame_equal(result, expected)
def test_basic(self):
    result = ordered_merge(self.left, self.right, on='key')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1, nan, 2, nan, 3, nan],
                          'rvalue': [nan, 1, 2, 3, nan, 4]})
    assert_frame_equal(result, expected)
def simdata(tmax, l, extpars, exttype='const', xmean=0, xvar=100, theta=1,
            noisevar=1, noiseext=1):
    df = tempdata(1, l, extpars, exttype, xmean, xvar, theta, noisevar,
                  noiseext)
    output = pd.Series(sum(df.tv))
    for i in range(2, tmax + 1):
        temp = tempdata(i, l, extpars, exttype, xmean, xvar, theta, noisevar,
                        noiseext)
        df = pd.ordered_merge(df, temp)
        df = df[df.death > i]
        output = output.append(pd.Series(sum(df.tv)))
    output = pd.DataFrame(output)
    output.columns = ['TV']
    output.index = range(1, len(output) + 1)
    return output
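# Note: Series.append (used above) was deprecated in pandas 1.4 and removed
# in 2.0; pd.concat is the replacement. A minimal sketch:
import pandas as pd

s1 = pd.Series([1.0])
s2 = pd.Series([2.0])
# equivalent of the old s1.append(s2, ignore_index=True)
combined = pd.concat([s1, s2], ignore_index=True)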
def merge_data(left_data, right_data, suffixes=('', None)):
    """ merge two different securities, indexed on left_data """
    left_data = left_data.reset_index()
    right_data = right_data.reset_index()
    left_suffix, right_suffix = suffixes
    if right_suffix is None:
        # default the right suffix to the first two letters of its symbol
        right_sym = right_data['symbol'].values[0][:2].lower()
        right_suffix = '_{}'.format(right_sym)
    merged_data = pd.ordered_merge(left_data, right_data, fill_method='ffill',
                                   on='time',
                                   suffixes=(left_suffix, right_suffix))
    merged_data = merged_data.fillna(0).set_index('time')
    # restrict the result to the left security's timestamps
    merged_data = merged_data.ix[left_data['time'].values].fillna(0)
    return merged_data
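# The merge-then-restrict pattern above (align the right series onto the
# left one's timestamps with a forward fill) can also be spelled with
# reindex; a sketch on toy data, not the author's API:
import pandas as pd

left = pd.Series([1.0, 2.0, 3.0],
                 index=pd.to_datetime(['2020-01-01 09:00',
                                       '2020-01-01 09:01',
                                       '2020-01-01 09:02']))
right = pd.Series([10.0, 20.0],
                  index=pd.to_datetime(['2020-01-01 09:00:30',
                                        '2020-01-01 09:01:30']))
# forward-fill right onto the union of timestamps, then keep left's
aligned = right.reindex(left.index.union(right.index)).ffill() \
               .reindex(left.index)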
    new_expression = new_expression.append(temp_df)
expression = new_expression

# calculating % of gene usage for each isotype
# new_expression2 = pd.DataFrame()
# for isotype in set(expression['isotype'].tolist()):
#     temp_df = expression[expression.isotype == isotype]
#     temp_df['exp_percent_isotype'] = temp_df['average'].divide(
#         temp_df.sum()['average'], axis=0, level=None, fill_value=None)
#     new_expression2 = new_expression2.append(temp_df)
# expression = new_expression2

# getting all alternative (wobble-driven) codons in decoding
a = wages[['anticodone', 'codone', 'isotype', 'wage_percent']]
b = expression[['anticodone', 'isotype']]
b['gene'] = 'gene'  # mark gene-encoded anticodons
c = pd.ordered_merge(a, b)
d = c[c.gene != 'gene']  # leave only wobble-driven codons
d = d.sort('isotype')

# re-assigning codons to new anticodons and adding wages for some genes
wobble_driven_anticodones = list()
wobble_dict = {'A': 'A', 'T': 'G', 'G': 'U', 'C': 'A',
               'a': 'a', 't': 'G', 'g': 'u', 'c': 'a'}
for i, row in d.iterrows():
    if row['isotype'] != 'Z':
        anti1 = wobble_dict[row['codone'][2]]
        anti23 = ReverseComplement(row['codone'])[1:3]
        d.loc[i, 'recog_ant'] = anti1 + anti23
        wages.loc[wages.anticodone == anti1 + anti23,
                  'wage_percent_correction'] += row['wage_percent']
wages['wages_sum'] = wages['wage_percent'] + wages['wage_percent_correction']
# print d
d.to_csv('wages', sep='\t')
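# The 'gene' marker column above emulates an anti-join (keep rows of a that
# have no match in b). pandas has a built-in spelling for this via
# merge(..., indicator=True); a sketch on made-up anticodon data:
import pandas as pd

a = pd.DataFrame({'anticodone': ['AAA', 'GGG', 'CCC'],
                  'isotype': ['K', 'P', 'G']})
b = pd.DataFrame({'anticodone': ['AAA'], 'isotype': ['K']})

merged = pd.merge(a, b, on=['anticodone', 'isotype'], how='left',
                  indicator=True)
wobble_only = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')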
rng2 = pandas.date_range('1/1/2011 05:00:00', periods=10, freq='H')
ts1 = pandas.Series(2 * len(rng), index=rng)
ts2 = pandas.Series(2 * len(rng), index=rng2)

def comparison(a, b):
    if numpy.isnan(a):
        return b
    elif numpy.isnan(b):
        return a
    elif a == b:
        return a
    else:
        raise Exception("oops. a={}, b={}".format(a, b))

ts2.combine(ts1, comparison)
"""

filename = '/home/jack/workingcopies/domesticPowerData/BellendenRd/version2/channel_99.dat'
signal = meterSignal.read_csv(filename, separator=' ',
                              colnames=['watts', 'port', 'cc_channel'])
signal2 = pandas.DataFrame(signal)
merged = pandas.ordered_merge(signal.watts, signal2.watts)
print(signal)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
meterSignal.plot_signal(merged, ax)
fig.autofmt_xdate()
plt.show()
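# Series.combine applies the NaN-resolving function pairwise, as sketched in
# the commented-out experiment above. When the two series are assumed to
# agree wherever both have data, combine_first is the built-in shortcut:
import numpy as np
import pandas as pd

s1 = pd.Series([1.0, np.nan, 3.0])
s2 = pd.Series([1.0, 2.0, np.nan])
merged = s2.combine_first(s1)  # [1.0, 2.0, 3.0]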
def test_deprecation(self):
    with tm.assert_produces_warning(FutureWarning):
        pd.ordered_merge(self.left, self.right, on='key')
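# As the test above documents, pd.ordered_merge was deprecated and renamed
# pd.merge_ordered (same signature; the old alias was later removed). A
# self-contained sketch, with left/right reconstructed from the expected
# frames in test_basic above:
import pandas as pd

left = pd.DataFrame({'key': ['a', 'c', 'e'], 'lvalue': [1.0, 2.0, 3.0]})
right = pd.DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3, 4]})

pd.merge_ordered(left, right, on='key')
# lvalue: [1, nan, 2, nan, 3, nan], rvalue: [nan, 1, 2, 3, nan, 4]
pd.merge_ordered(left, right, on='key', fill_method='ffill')
# lvalue: [1, 1, 2, 2, 3, 3], rvalue: [nan, 1, 2, 3, 3, 4]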
def fetch_market_data(days=366, window=15, region=10000002, debug=global_debug):
    global V, desired_stats
    print 'Fetching market data ...'
    raw_query = \
        '''SELECT itemid, price_date, volume, avgprice
           FROM crest_markethistory
           WHERE regionid = %s
           AND price_date > (SELECT max(price_date) FROM crest_markethistory) - INTERVAL %s DAY
           ORDER BY itemid, price_date''' % (region, days + 30)
    if debug:
        raw_query = \
            '''SELECT itemid, price_date, volume, avgprice
               FROM crest_markethistory
               WHERE regionid = %s
               AND itemid = 34
               AND price_date > (SELECT max(price_date) FROM crest_markethistory) - INTERVAL %s DAY
               ORDER BY itemid, price_date''' % (region, days + 30)
    V.raw_query = raw_query

    raw_data = psql.read_sql(raw_query, data_conn, parse_dates=['price_date'])
    V.raw_data = raw_data

    # use item 34's dates as the calendar of expected price dates
    expected_dates = pd.DataFrame(raw_data[raw_data.itemid == 34].price_date)
    expected_dates.index = expected_dates.price_date
    V.expected_dates = expected_dates

    raw_data_filled = pd.ordered_merge(
        raw_data[raw_data.itemid.isin(convert.index)],
        expected_dates,
        on='price_date',
        left_by='itemid'
    )
    # 1.0 where a price exists for that day, NaN where the row was gap-filled
    raw_data_filled['present'] = raw_data_filled.avgprice / raw_data_filled.avgprice
    raw_data_filled.fillna({'volume': 0}, inplace=True)
    raw_data_filled['price_delta_sma'] = \
        raw_data_filled \
        .groupby('itemid') \
        .avgprice \
        .apply(lambda x: x - pd.rolling_mean(
            x.interpolate().fillna(method='bfill'), window))
    # raw_data_filled['price_delta_sma2'] = raw_data_filled['price_delta_sma'] ** 2
    raw_data_filled['price_delta_smm'] = \
        raw_data_filled \
        .groupby('itemid') \
        .avgprice \
        .apply(lambda x: x - pd.rolling_median(
            x.interpolate().fillna(method='bfill'), window))
    V.raw_data_filled = raw_data_filled
    # raw_data_filled['price_delta_smm2'] = raw_data_filled['price_delta_smm'] ** 2

    desired_stats = ['volume', 'price_delta_sma', 'price_delta_smm']
    raw_data_filled.index = raw_data_filled.itemid
    # keep only items with at least one full window, then the last `days` rows
    raw_data_filled = \
        raw_data_filled \
        .groupby('itemid') \
        .filter(lambda x: len(x.index) >= window) \
        .groupby('itemid') \
        .tail(days)
    return raw_data_filled.groupby('itemid')
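# pd.rolling_mean / pd.rolling_median (used above) were deprecated in pandas
# 0.18 and later removed; the method form is the replacement. A sketch of
# the same gap-fill-then-detrend step on toy data:
import pandas as pd

window = 3
x = pd.Series([1.0, 2.0, None, 4.0, 5.0])
filled = x.interpolate().bfill()
price_delta_sma = filled - filled.rolling(window).mean()
price_delta_smm = filled - filled.rolling(window).median()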
def handle_uploaded_file(f, dirname):
    path = os.path.join('../static/results', dirname)
    try:
        os.makedirs(path)
    except OSError as e:
        print e
        print 'unable to create directory ' + path
    file = f['docfile']
    with open(path + '/' + file.name, 'wb+') as destination:
        for chunk in file.chunks():
            destination.write(chunk)

    # new code for parsing
    data = pandas.read_csv(path + '/' + file.name)
    cols = ['SYS.FIL.APP.',
            'PST.EXP.CLD.CP.', 'PST.EXP.CLD.OT1.', 'PST.EXP.CLD.OT2.',
            'PST.EXP.BED.CP.', 'PST.EXP.BED.OT1.', 'PST.EXP.BED.OT2.',
            'TSK.PRB.ANS.CP.', 'TSK.PRB.ANS.OT1.', 'TSK.PRB.ANS.OT2.',
            'TSK.CON.CP.', 'TSK.CON.OT1.', 'TSK.CON.OT2.',
            'TSK.TIME.DIFF.CP.', 'TSK.TIME.DIFF.OT1.', 'TSK.TIME.DIFF.OT2.']
    workingData = data[cols]
    workingDataDF = pandas.DataFrame(workingData)
    # round every metric column (everything but the tool name) to integers
    metric_cols = cols[1:]
    workingDataDF[metric_cols] = np.around(workingDataDF[metric_cols], 0)

    tools = pandas.DataFrame(workingData['SYS.FIL.APP.']).drop_duplicates() \
                  .sort('SYS.FIL.APP.').reset_index()
    del tools['index']
    tools.columns = ['Tools']
    tools.to_json(path_or_buf=path + '/' + "tools.json")

    metrics = {'load': {'col': 'PST.EXP.CLD.CP.', 'max': 5},
               'loadOT1': {'col': 'PST.EXP.CLD.OT1.', 'max': 5},
               'loadOT2': {'col': 'PST.EXP.CLD.OT2.', 'max': 5},
               'difficulty': {'col': 'PST.EXP.BED.CP.', 'max': 10},
               'difficultyOT1': {'col': 'PST.EXP.BED.OT1.', 'max': 10},
               'difficultyOT2': {'col': 'PST.EXP.BED.OT2.', 'max': 10},
               'performance': {'col': 'TSK.PRB.ANS.CP.', 'max': 10},
               'performanceOT1': {'col': 'TSK.PRB.ANS.OT1.', 'max': 10},
               'performanceOT2': {'col': 'TSK.PRB.ANS.OT2.', 'max': 10},
               'confidence': {'col': 'TSK.CON.CP.', 'max': 10},
               'confidenceOT1': {'col': 'TSK.CON.OT1.', 'max': 10},
               'confidenceOT2': {'col': 'TSK.CON.OT2.', 'max': 10},
               'time': {'col': 'TSK.TIME.DIFF.CP.', 'max': 10},
               'timeOT1': {'col': 'TSK.TIME.DIFF.OT1.', 'max': 10},
               'timeOT2': {'col': 'TSK.TIME.DIFF.OT2.', 'max': 10}}

    for key, value in metrics.items():
        df = pandas.DataFrame({key: workingDataDF.groupby(
            ['SYS.FIL.APP.', value['col']],
            sort=0, as_index=False).size()}).reset_index()
        df.columns = ['Tool', 'Range', 'Count']
        df = df.sort(['Tool', 'Range'], ascending=[1, 1])
        # build the full (tool, range) grid so bins with no responses still
        # appear as rows (na_rep='0' then writes their counts as zero)
        array = []
        for i in tools.Tools:
            maxVal = int(value['max']) + 1
            for j in range(1, maxVal):
                array.append([i, j])
        d = pandas.DataFrame(array, columns=('Tool', 'Range'))
        result = pandas.ordered_merge(df, d)
        result.to_csv(path_or_buf=path + '/' + str(key) + '.csv', sep=',',
                      na_rep='0', index=False)
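# The nested loop that builds the (tool, range) grid has a compact pandas
# equivalent; a sketch with made-up tool names:
import pandas as pd

tools = ['toolA', 'toolB']
max_range = 5
grid = pd.MultiIndex.from_product(
    [tools, range(1, max_range + 1)],
    names=['Tool', 'Range']).to_frame(index=False)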
def merge_trades_and_quotes(data):
    return [pd.ordered_merge(quotes, trades, fill_method='ffill')
            for quotes, trades in data]
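# A self-contained sketch of what one (quotes, trades) pair might look like
# here (the column names are made up), using the modern pd.merge_ordered
# spelling; each quote row picks up the last trade at or before its time:
import pandas as pd

quotes = pd.DataFrame({'time': [1, 2, 4, 5],
                       'bid': [10.0, 10.1, 10.1, 10.2]})
trades = pd.DataFrame({'time': [2, 4], 'trade_px': [10.1, 10.15]})

merged = pd.merge_ordered(quotes, trades, on='time', fill_method='ffill')
# trade_px: [nan, 10.1, 10.15, 10.15]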