def test_nested_scope(self):
    engine = self.engine
    parser = self.parser
    skip_if_no_pandas_parser(parser)

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    expected = df[(df > 0) & (df2 > 0)]
    result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser)
    assert_frame_equal(result, expected)

    result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, parser=parser)
    assert_frame_equal(result, expected)

    result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                     engine=engine, parser=parser)
    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    assert_frame_equal(result, expected)

    result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
    expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
    assert_frame_equal(result, expected)
def test_nested_scope(self):
    from pandas.core.computation.ops import UndefinedVariableError

    engine = self.engine
    parser = self.parser
    # smoke test
    x = 1  # noqa
    result = pd.eval('x + 1', engine=engine, parser=parser)
    assert result == 2

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    # don't have the pandas parser
    with pytest.raises(SyntaxError):
        df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

    with pytest.raises(UndefinedVariableError):
        df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

    expected = df[(df > 0) & (df2 > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, parser=parser)
    assert_frame_equal(expected, result)

    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                     engine=engine, parser=parser)
    assert_frame_equal(expected, result)
def _squared_distance(s1, s2, a=0, b=1):
    expr = '((s2-s1)-a)**2*b**2'
    # PANDAS BUG?
    # return pandas.eval(expr, engine=None)
    try:
        return pandas.eval(expr, engine='numexpr')
    except ImportError:
        return pandas.eval(expr, engine='python')
def _linear_distance(s1, s2, a=0, b=1):
    expr = 'abs(((s2-s1)-a)*b)'
    # PANDAS BUG?
    # return pandas.eval(expr, engine=None)
    try:
        return pandas.eval(expr, engine='numexpr')
    except ImportError:
        return pandas.eval(expr, engine='python')
def _haversine_distance(lat1, lng1, lat2, lng2):
    # degrees to radians conversion
    to_rad = 1 / 360 * np.pi * 2
    # numeric expression to use with numexpr package
    expr = ('2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2'
            '+cos(lat1*to_rad)*cos(lat2*to_rad)'
            '*(sin((lng2*to_rad-lng1*to_rad)/2))**2))')
    # PANDAS BUG?
    # return pandas.eval(expr, engine=None)
    try:
        return pandas.eval(expr, engine='numexpr')
    except ImportError:
        return pandas.eval(expr, engine='python')
def visit_Subscript(self, node, **kwargs):
    value = self.visit(node.value)
    slobj = self.visit(node.slice)
    result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
                     parser=self.parser)
    try:
        # a Term instance
        v = value.value[result]
    except AttributeError:
        # an Op instance
        lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
                      parser=self.parser)
        v = lhs[result]
    name = self.env.add_tmp(v)
    return self.term_type(name, env=self.env)
def evaluate(self, env, engine, parser, term_type, eval_in_python):
    """Evaluate a binary operation *before* being passed to the engine.

    Parameters
    ----------
    env : Scope
    engine : str
    parser : str
    term_type : type
    eval_in_python : list

    Returns
    -------
    term_type
        The "pre-evaluated" expression as an instance of ``term_type``
    """
    if engine == "python":
        res = self(env)
    else:
        # recurse over the left/right nodes
        left = self.lhs.evaluate(
            env, engine=engine, parser=parser, term_type=term_type,
            eval_in_python=eval_in_python)
        right = self.rhs.evaluate(
            env, engine=engine, parser=parser, term_type=term_type,
            eval_in_python=eval_in_python)

        # base cases
        if self.op in eval_in_python:
            res = self.func(left.value, right.value)
        else:
            res = pd.eval(self, local_dict=env, engine=engine, parser=parser)

    name = env.add_tmp(res)
    return term_type(name, env=env)
def test_eval_resolvers_as_list(self):
    # GH 14095
    df = DataFrame(np.random.randn(10, 2), columns=list('ab'))
    dict1 = {'a': 1}
    dict2 = {'b': 2}
    assert (df.eval('a + b', resolvers=[dict1, dict2]) ==
            dict1['a'] + dict2['b'])
    assert (pd.eval('a + b', resolvers=[dict1, dict2]) ==
            dict1['a'] + dict2['b'])
def _haversine_distance(lat1, lng1, lat2, lng2):
    # degrees to radians conversion
    to_rad = np.deg2rad(1)
    # numeric expression to use with numexpr package
    expr = ('2*6371*arcsin(sqrt((sin((lat2*to_rad-lat1*to_rad)/2))**2'
            '+cos(lat1*to_rad)*cos(lat2*to_rad)'
            '*(sin((lng2*to_rad-lng1*to_rad)/2))**2))')
    return pandas.eval(expr)
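# Hedged sanity check for the haversine helper above, not part of the original
# source: it assumes numpy/pandas are imported as np/pandas as in the snippet,
# and that numexpr is installed so pd.eval() can resolve arcsin/sqrt/sin/cos.
# London (51.5074, -0.1278) to Paris (48.8566, 2.3522) should come out near
# 343 km.
import numpy as np
import pandas

lat1, lng1 = pandas.Series([51.5074]), pandas.Series([-0.1278])
lat2, lng2 = pandas.Series([48.8566]), pandas.Series([2.3522])
print(_haversine_distance(lat1, lng1, lat2, lng2))  # ~343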
def _step_sim(d, offset=0, origin=0):
    # scale is not an argument
    if offset < 0:
        raise ValueError("The offset must be positive.")
    expr = 'abs(d - origin) <= offset'
    return pandas.eval(expr).astype(np.int64)
def _check_rule(data, rule):
    data = deepcopy(data)
    exclude_column = 'exclude-' + rule['alertName']
    idx = data[exclude_column] != True
    data = data[idx]
    check = pd.eval(rule['formula'])
    if check.any():
        campaigns = list(set(data[check].campaign))
        msg = make_alert_msg(rule['alertName'], campaigns)
        return msg
def _linear_sim(d, scale, offset=0, origin=0):
    if offset < 0:
        raise ValueError("The offset must be positive.")
    if scale <= 0:
        raise ValueError("The scale must be larger than 0.")
    d = (abs(d - origin)).clip(offset, offset + 2 * scale)
    expr = '1 - (d-offset)/(2*scale)'
    return pandas.eval(expr)
def _gauss_sim(d, scale, offset=0, origin=0):
    if offset < 0:
        raise ValueError("The offset must be positive.")
    if scale <= 0:
        raise ValueError("The scale must be larger than 0.")
    d = (abs(d - origin)).clip(offset, None)
    # y = 2**(-(x/scale)**2): similarity halves where d - offset == scale
    expr = '2**(-((d-offset)/scale)**2)'
    return pandas.eval(expr)
def _case(case):
    as_name, else_stmt, stmts = \
        case['as_name'], case.get('else_stmt', None), case['stmts']
    # make a copy of a column in the data frame and use it as a base
    col = self._curr_val.iloc[:, 0].copy()
    if else_stmt is not None:
        else_val = _get_val(*else_stmt)
        col.loc[:] = else_val
    else:
        # default to NULL as no else val specified
        col.loc[:] = None
    for (ev_str, identifiers), stmt in stmts:
        print(ev_str)
        idx = pd.eval(ev_str, local_dict=id_dict(identifiers))
        val = _get_val(*stmt)
        col[idx] = val[idx]
    self._curr_val[as_name] = col
def _squared_sim(d, scale, offset=0, origin=0):
    if offset < 0:
        raise ValueError("The offset must be positive.")
    if scale <= 0:
        raise ValueError("The scale must be larger than 0.")
    d = (abs(d - origin)).clip(offset, offset + np.sqrt(2) * scale)
    # solve y = 1 - a*d**2 given y(d=scale) = 0.5:
    #   1 - y = a*d**2
    #   a = (1 - y)/d**2
    # fill in y = 0.5 and d = scale:
    #   a = (1 - 0.5)/scale**2 = 1/(2*scale**2)
    #   y = 1 - 1/2*(d/scale)**2
    # d = sqrt(2)*scale is the point where the similarity is zero.
    expr = '1 - 1/2*exp(2*log((d-offset)/scale))'
    return pandas.eval(expr)
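# A minimal check of the derivations in the *_sim kernels above, not part of
# the original source (assumes the numpy/pandas imports used by the snippets):
# each kernel is constructed so that similarity reaches 0.5 exactly one
# `scale` beyond `offset`.
import pandas

d = pandas.Series([0.5, 1.0, 2.0])
print(_linear_sim(d, scale=1))   # [0.75, 0.5, 0.0]
print(_gauss_sim(d, scale=1))    # [~0.841, 0.5, 0.0625]
print(_squared_sim(d, scale=1))  # [0.875, 0.5, 0.0]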
def _1d_distance(s1, s2):
    return pandas.eval("s2-s1")
def _operation(op):
    as_name, expr = op['as_name'], op['expr']
    ev_str, identifiers = expr
    col = pd.eval(ev_str, local_dict=id_dict(identifiers))
    self._curr_val[as_name] = col
def time_eval_frame_chained_cmp_all_threads(self):
    pd.eval('df < df2 < df3 < df4')
def testit():
    a, b = 1, 2  # noqa
    res = pd.eval('a + b', engine=engine, parser=parser)
    assert res == 3
def testit():
    a, b = 1, 2
    res = pd.eval('a + b', engine=engine, parser=parser)
    tm.assert_equal(res, 3)
def time_eval_frame_and_all_threads(self):
    pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
ax = data.plot(style=['-', '--', ':'])
ax.lines[0].set_alpha(0.3)

# get components of the date index
dataA.index.time
dataA.index.dayofweek
dataA.index.weekday

### High-performance Pandas: eval(), query()
# eval(): string expressions to efficiently compute operations on DataFrames
rng = np.random.RandomState(42)
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
                           for i in range(5))

# arithmetic operators
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)  # same result

# comparisons, including chained comparisons
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')

# bitwise: & and |
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# the literals `and` and `or` also work in Boolean expressions:
result2 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')

# object attributes and indices
result2 = pd.eval('df2.T[0] + df3.iloc[1]')

# DataFrame.eval() for column-wise operations
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
result2 = df.eval('(A + B) / (C - 1)')
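# Hedged companion example for the notes above, not part of the original
# walkthrough: DataFrame.query() filters rows with the same string syntax as
# eval(), and '@' pulls in local Python variables (df reuses the frame
# defined just above).
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)  # True

Cmean = df['C'].mean()
result3 = df.query('A < @Cmean and B < @Cmean')  # @ references local variables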
def time_add(self, engine, threads):
    pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine)
# %%
import pandas as pd
import numpy as np

# %%
nrows, ncols = 1000000, 100
df_A, df_B, df_C, df_D = (pd.DataFrame(np.random.randn(nrows, ncols))
                          for i in range(4))

# %%
%timeit df_A + df_B + df_C + df_D

# %%
%timeit pd.eval('df_A + df_B + df_C + df_D')

# %%
df = pd.read_csv('/home/bpt-pandas/data/hotel_bookings.csv')

# %%
df.query('adults > 2 and lead_time < 40')

# %%
fifa_df = pd.read_csv("/home/bpt-pandas/data/fifa-data.csv",
                      usecols=["Name", "Age", "Nationality", "Club",
                               "Overall", "Value", "Wage"])
fifa_df

# %%
fifa_df[['Value', 'Wage']] = fifa_df[['Value', 'Wage']] \
    .apply(lambda s: s.replace(r'[\€,)]', '', regex=True))
fifa_df['Value'] = fifa_df['Value'].replace(
    {'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
def to_df(self):
    """
    returns a pandas DataFrame object based on parsed data from a
    Commodity object's HTML
    """
    try:
        df = pd.read_html(self.response)
        df = df[0]  # ignore footer table
        if S.DBG_ICOM:
            df.to_csv(S.WORK_DIR + "/" + self.name + ".inf")
        price = df['Price'][0]
        # print self.name, type(price), price
        if math.isnan(price):  # no result found
            return None
        df["Date"] = pd.to_datetime(df["Date"])
        df.insert(0, "Commodity", np.nan)
        df["Commodity"] = self.name
        df.insert(6, "Close", np.nan)
        df["Close"] = df["Price"]
        df.insert(7, "Volume", np.nan)
        if self.name.startswith('USD'):
            df['Volume'] = 0
        elif self.name.startswith('FTFBM'):
            df['Volume'] = df["Vol."]
        else:
            mp = {'K': ' * 10**3', 'M': ' * 10**6'}
            # vol = df['Vol.'][0]
            # print type(vol), vol
            df['Vol.'] = df['Vol.'].replace('-', '0.1K')
            # replace all 0 vol with 100 shares
            df['Vol.'] = df['Vol.'].replace(0, '0.1K')
            # Convert K to 1000 and M to 1000000.
            # Important: can only support max 5 months of EOD to convert.
            df["Volume"] = pd.eval(df["Vol."].replace(
                mp.keys(), mp.values(),
                regex=True).str.replace(r'[^\d\.\*]+', ''))
        df.drop('Price', axis=1, inplace=True)
        df.drop('Change %', axis=1, inplace=True)
        if 'Vol.' in df.columns:  # FOREX has no "Vol." column
            df.drop('Vol.', axis=1, inplace=True)
        df.sort_values(by='Date', inplace=True)
    except ValueError as ve:
        df = 'ValueError'
        self.csverr = (self.name + ": ValueError (No data for date range) " +
                       ' (' + str(ve) + ')')
        if S.DBG_ICOM:
            with open(S.WORK_DIR + "value.err", 'ab') as f:
                f.write('\n=============================\n')
                f.write(self.name + "\n")
                f.write(self.response)
    except Exception as e:
        # This happens when records being processed are larger than 3 months
        # of data; try reducing the period.
        if S.DBG_ICOM:
            with open(S.WORK_DIR + "value.err", 'ab') as f:
                f.write('\n=============================\n')
                f.write(self.name + "\n")
                f.write(self.response)
        self.csverr = (self.name + ":" + self.start + "," + self.end + ":" +
                       str(e))
        df = 'Exception'
        # raise e
    return df
def update(df, formulas):
    for k, v in formulas.items():
        df[k] = pd.eval(v)
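# Hypothetical usage sketch for update() above (the frame and formula names
# here are illustrative, not from the original): each value in `formulas` is a
# string that pd.eval() resolves against the names visible where it is called,
# i.e. update()'s local `df`.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
update(df, {'total': 'df.a + df.b'})
print(df['total'].tolist())  # [4, 6]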
def test_invalid_numexpr_version(engine, parser):
    if engine == "numexpr":
        pytest.importorskip("numexpr")
    a, b = 1, 2  # noqa:F841
    res = pd.eval("a + b", engine=engine, parser=parser)
    assert res == 3
def eval(self, lope, rope):
    return pd.eval('lope / rope')
EDCTN.hist(bins=30, figsize=(40, 35), xrot=90)
plt.suptitle('25 and Older Education Obtained', va='baseline', size=32)
plt.rcParams.update({'font.size': 16})

# In[162]:

POVERTY.drop('StateFIPS', axis=1).hist(bins=30, figsize=(20, 15), xrot=90)
plt.suptitle('Those Living Under $30,962 per Year', va='baseline', size=24)
plt.rcParams.update({'font.size': 16})

# In[160]:

POV_CALC = POVERTY.drop(['StateFIPS', 'Prcnt_UND_30962'], axis=1)
POV_POP = pd.merge(POV_CALC, POP, on='state', how='outer')
POV_POP['under_thrsh_pr100k'] = pd.eval(
    '(POV_POP.Under_30962/POV_POP.POP_2018)*100000')
POV_POP['under_thrsh_pr100k'].drop(POV_POP.index[39]).hist(
    bins=30, figsize=(20, 15), xrot=90)

# In[151]:

columns = [
    "state", "high_school_dipl", "No_schooling", "Master_degree",
    "Associate_degree", "Bachelor_degree", "Doctorate"
]
columns2 = [
    'state', 'HighSchool_pr100k', 'NoSch_pr100k', 'MasterDeg_pr100k',
    'AssociateDeg_pr100k', 'BachDeg_pr100k', 'Doctorate_pr100k'
]
TRG_EDCTN = EDUCATION.loc[:, columns]
print(max_expr, '\n')
# The benefit here is that numexpr evaluates the expression in a way that does
# not use full-sized temporary arrays, and thus can be much more efficient
# than NumPy, especially for large arrays.

print('pandas.eval() for Efficient Operations:\n')
nrows, ncols = 100000, 100
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))

t1 = time.time()
stmt = df1 + df2 + df3 + df4
t2 = time.time()
print('Total time taken = ', t2 - t1)

t1 = time.time()
stmt1 = pd.eval('df1 + df2 + df3 + df4')
t2 = time.time()
print('Total time taken pd.eval = ', t2 - t1)
# numpy.allclose returns True if two arrays are element-wise equal within a
# tolerance!

print('Operations supported by pd.eval(): supports a wide range of operations.\n')
results = pd.eval('df1 < df2 <= df3 != df4')

print('Object attributes and indices')
print('pd.eval() supports access to object attributes via the obj.attr '
      'syntax, and indexes via the obj[index] syntax:\n')
results2 = pd.eval('df2.T[0] + df3.iloc[1]')
# Other operations such as function calls, conditional statements, loops, and
# other more involved constructs are currently not implemented in pd.eval().

print('DataFrame.eval() for Column-Wise Operations:\n')
print('The benefit of the myDf.eval() method is that columns can be '
      'referred to by name.')
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
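# Hedged continuation of the walkthrough above, not part of the original
# script (assumes numpy is imported as np alongside the snippet's other
# imports): with DataFrame.eval() the columns A, B, C are referenced by bare
# name, and assignment syntax creates a new column in place.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = df.eval('(A + B) / (C - 1)')
print(np.allclose(result1, result2))  # True

df.eval('D = (A + B) / C', inplace=True)  # column-wise assignment
print(df.head())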
def score_gekko(team_projections, team_id, opponent_scoring,
                scoring_categories, date_range, roster_makeup,
                date_last_use_actuals=None, roster_change_set=None,
                actual_scores=None):
    # if we don't have any actuals, let's return 0 for all stats
    if actual_scores is None:
        actual_scores = defaultdict(lambda: 0)

    m = GEKKO(remote=False, server='http://localhost:8083')
    m.options.SOLVER = 1

    # with roster changes we make changes, so let's copy the projections
    current_projections = team_projections.copy()
    # projections for players who may play; changes with roster changes during period
    projections_with_added_players = team_projections.copy()

    rewards = defaultdict(list)
    player_vars = defaultdict(dict)
    rc_dict = defaultdict(list)
    if roster_change_set:
        rc_dict = _roster_changes_as_day_dict(roster_change_set)

    for game_day_idx, game_day in enumerate(date_range):
        for rc in rc_dict[game_day.date()]:
            # TODO should really figure out how to deal with this. sometimes
            # it is string, sometimes list. i think it has to do with
            # serializing via jsonpickle
            with suppress(Exception):
                rc.in_projections['eligible_positions'] = pd.eval(
                    rc.in_projections['eligible_positions'])
            # add player in projections to projection dataframe
            current_projections = current_projections.append(rc.in_projections)
            projections_with_added_players = projections_with_added_players.append(
                rc.in_projections)
            current_projections.drop(rc.out_player_id, inplace=True)
            # current_projections.sort_values(by='fpts', ascending=False, inplace=True)

        teams_playing_today = nhl_scraper._teams_playing_one_day(
            game_day.to_pydatetime().date())
        game_day_players = current_projections[
            current_projections.team_id.isin(teams_playing_today)]

        scores = {}
        players = {}
        for position in roster_makeup.keys():
            available_for_position = game_day_players[
                game_day_players.eligible_positions.map(
                    set([position]).issubset)]
            if len(available_for_position) > 0:
                players[position] = list(available_for_position.index)
                scores[position] = available_for_position[scoring_categories]

        vars_by_player = defaultdict(list)
        for position in players:
            for player in players[position]:
                player_var = m.Var(1, 0, 1, True,
                                   name=f"{game_day_idx}_{position}_{player}")
                if position not in player_vars[game_day_idx]:
                    player_vars[game_day_idx][position] = []
                player_vars[game_day_idx][position].append(player_var)
                vars_by_player[player].append(player_var)
                for category in scoring_categories:
                    rewards[category].append(
                        player_var * game_day_players.loc[player, category])
            # limit amount of players to roster size allowed for position
            if position in player_vars[game_day_idx]:
                m.Equation(
                    m.sum(player_vars[game_day_idx][position]) <=
                    roster_makeup[position])

        # for players with multiple eligible positions, make sure they only
        # appear once
        for player_id, player_info in game_day_players.iterrows():
            eligible_positions = player_info.eligible_positions
            if len(eligible_positions) > 1:
                m.Equation(m.sum(vars_by_player[player_id]) <= 1)

    for category in scoring_categories:
        m.Obj(-1 * (1 / (1 + e**(-(m.sum(rewards[category]) +
                                   actual_scores[category] -
                                   opponent_scoring[category])))))

    result = None
    try:
        m.solve(disp=False)
    except Exception as ex:
        print('Exception')
        logger.exception("Exception in gekko scoring", ex)
        time.sleep(1)
        try:
            m.solve()
        except Exception as ex2:
            logger.exception("Exception in retry gekko scoring", ex2)
            raise ex2

    rostered_players = []
    for game_day_idx, game_day in enumerate(date_range):
        for position in roster_makeup.keys():
            if position in player_vars[game_day_idx]:
                for player in player_vars[game_day_idx][position]:
                    if player.value[0] == 1:
                        attrs = player.name.split("_")
                        player_id = int(attrs[-1])
                        position = player.name.split("_")[-2].upper()
                        rostered_players.append(
                            [player_id, position, 'p', game_day])
    results = pd.DataFrame(
        rostered_players,
        columns=['player_id', 'rostered_position', 'score_type', 'play_date'])
    results = results.join(projections_with_added_players[scoring_categories],
                           on='player_id')
    return results
def time_chained_cmp(self, engine, threads):
    pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine)
}, inplace=True)

for comb in range(combinations):
    # print(comb)
    if comb == 0:
        campos = [i for i in range(len(stage[period_back].columns) - 1)]
        campos.pop(0)
        for i in range(1, len(campos) - 2, 3):
            campos.remove(i)
        print(campos)
        campos_print = campos[:]
        campos_print.insert(0, campos[-2])
        campos_print.pop(-2)
        stage2 = stage[period_back].iloc[:, pd.eval(campos_print)]
        stage2.to_csv(r'S:\proyecto2\csv\SCENES\scene' + frec_scene + '-' +
                      str((max_period - period_back + 1)) + '_' + desc[comb] +
                      '.csv', index=False)
    if comb == 1:
        for i in range(3, campos[-1] - 1, 3):
            campos.remove(i)
        campos_print = campos[:]
        campos_print.insert(0, campos[-2])
        campos_print.pop(-2)
        stage2 = stage[period_back].iloc[:, pd.eval(campos_print)]
        stage2.to_csv(r'S:\proyecto2\csv\SCENES\scene' + frec_scene + '-' + str(
def _signal(self, df_sys, fast, slow, trade="buy", df_price=None,
            signal="entry", price="open"):
    """
    Generate entry or exit signals individually based on two signals
    crossing one another. The direction in which the cross is evaluated
    is defined by the `trade` value ('buy' or 'sell').

    Parameters
    ----------
    df_sys : pandas.core.frame.DataFrame
        A pandas dataframe indexed by timestamp at consistent intervals,
        containing two columns each holding a signal's values.
    fast : str
        The column label for the signal defined by a short timeframe
        moving average.
    slow : str
        The column label for the signal defined by a long timeframe
        moving average.
    trade : {'buy', 'sell'}
        The trade direction against which the signal crosses should be
        evaluated.
    df_price : pandas.core.frame.DataFrame
        The dataframe containing the entry or exit price that should be
        matched against the given timestamp directly following the
        session in which the signal cross occurred.
    signal : {'entry', 'exit'}
        Whether to generate price points for a trade's entry or exit.
    price : str
        The column label for the chosen entry or exit price.

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe with a series of entry or exit prices generated by
        two signals crossing one another.

    Notes
    -----
    Optimisations yield the following improvements:
    1. 1min 28s ± 5.46 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
    2. 53.4 s ± 1.12 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
    3. 10.8 s ± 400 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    """
    if trade == "buy":
        df_sys.eval(f"sys = {fast} > {slow}", inplace=True)
    elif trade == "sell":
        df_sys.eval(f"sys = {fast} < {slow}", inplace=True)

    df_sys["prev_sys"] = df_sys["sys"].shift(2)
    df_sys["curr_sys"] = df_sys["sys"].shift(1)

    if signal == "entry":
        df_sys[signal] = pd.eval(
            "(df_sys.curr_sys == 1) & (df_sys.prev_sys == 0)")
    elif signal == "exit":
        df_sys[signal] = pd.eval(
            "(df_sys.curr_sys == 0) & (df_sys.prev_sys == 1)")

    df_sys.index.rename("timestamp", inplace=True)
    en_ex_prep = df_sys[df_sys[signal] == True][signal].copy().reset_index()
    del df_sys

    # datetime column label differs due to different setups.
    en_ex_price = en_ex_prep.merge(df_price[price], how="left",
                                   left_on="timestamp", right_index=True,
                                   validate="1:1")
    s = en_ex_price.rename(columns={
        "timestamp": f"{signal}_dt",
        f"{price}": f"{signal}_price",
        f"{signal}": f"{signal}_type",
    })
    return s
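# A standalone sketch of the crossover detection used in _signal() above; all
# names here are illustrative, not from the original. The idea: shift the
# boolean fast-over-slow series and flag the row where it flips from False to
# True.
import pandas as pd

df_sys = pd.DataFrame({'fast': [1, 2, 3, 2, 1], 'slow': [2, 2, 2, 2, 2]})
df_sys.eval('sys = fast > slow', inplace=True)
df_sys['prev_sys'] = df_sys['sys'].shift(2)
df_sys['curr_sys'] = df_sys['sys'].shift(1)
entry = pd.eval('(df_sys.curr_sys == 1) & (df_sys.prev_sys == 0)')
print(entry.tolist())  # [False, False, False, True, False]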
def check_raise_on_panel4d_with_multiindex(self, parser, engine):
    tm.skip_if_no_ne()
    p4d = tm.makePanel4D(7)
    p4d.items = tm.makeCustomIndex(len(p4d.items), nlevels=2)
    with tm.assertRaises(NotImplementedError):
        pd.eval('p4d + 1', parser=parser, engine=engine)
def time_eval_frame_and_python(self):
    pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
x
x = rng.rand(int(1E6) // 4)
x

# pandas.eval()
import pandas as pd

nrows, ncols = 100000, 100
nrows, ncols
rng = np.random.RandomState(42)
rng
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))
df1.head()

%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1+df2+df3+df4')
np.allclose(df1+df2+df3+df4, pd.eval('df1+df2+df3+df4'))

# Arithmetic Operators
# Comparison
# Bitwise
# Object Attributes
# Columnwise Operations
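# Hedged sketches for the operation families listed above, reusing df1..df4
# (not part of the original session): pd.eval() supports chained comparisons,
# bitwise/boolean logic, and attribute/index access.
np.allclose(pd.eval('df1 < df2 <= df3 != df4'),
            (df1 < df2) & (df2 <= df3) & (df3 != df4))   # comparison
np.allclose(pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)'),
            (df1 < 0.5) & (df2 < 0.5) | (df3 < df4))     # bitwise
np.allclose(pd.eval('df2.T[0] + df3.iloc[1]'),
            df2.T[0] + df3.iloc[1])                      # attributes/indexing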
def time_eval_frame_add_one_thread(self):
    pd.eval('df + df2 + df3 + df4')
def load_csv(filename, features_list, mode):
    df = pd.read_csv(filename, header=12, dtype=str)
    # df = df.loc[:, data_list]
    df.dropna(axis=0, how='any', inplace=True)

    # minute, second separation:
    df['Program time'] = df['Program time'].str.split(':')
    df[['minute', 'second']] = pd.DataFrame(df['Program time'].values.tolist(),
                                            index=df.index)
    df['new_set'] = pd.eval(
        'df.minute.str.contains("00") and df.second.str.contains("00.0")',
        engine='python')
    df.drop(columns=['Program time'], inplace=True)

    if mode == 0:
        search_list = ['AhCha', 'AhDch', 'Temp', 'minute', 'second']
    elif mode == 1:
        search_list = ['Current', 'Voltage', 'Temp', 'minute', 'second']
    elif mode == 2:
        search_list = ['Current', 'Voltage', 'AhCha', 'AhDch', 'Temp',
                       'minute', 'second']
    df[search_list] = df[search_list].apply(pd.to_numeric, errors='raise')
    df[search_list] = df[search_list].astype('float64', copy=True,
                                             errors='raise')
    df.reset_index(drop=True, inplace=True)

    # get index of new sets
    # these indexes to the next need to be added with 30 secs (see above)
    set_index = df.index[df['new_set']].tolist()
    del set_index[0:3]
    second_increment = [
        round(df['second'][i - 1] - df['second'][i - 2], 2) for i in set_index
    ]

    # index of new sets and everything after, increase by second_increment
    for index in range(len(second_increment)):
        if index != len(second_increment) - 1:
            df['second'][set_index[index]:set_index[index + 1]] = df['second'][
                set_index[index]:set_index[index + 1]] + second_increment[index]
        else:
            df['second'][set_index[index]:] = df['second'][
                set_index[index]:] + second_increment[index]

    # now, we form a list of new sets - 1
    prev_index = [i - 1 for i in set_index]
    # take values of indexes new set - 1, add to indexes next set and everything after
    seconds_summation = [(df['minute'][i] * 60) + df['second'][i]
                         for i in prev_index]
    for index in range(len(seconds_summation)):
        df['second'][set_index[index]:] = df['second'][
            set_index[index]:] + seconds_summation[index]

    # finally, convert all minutes to seconds
    df['second'] = df['second'] + df['minute'] * 60

    # do some clean-ups
    df.drop(columns=['minute', 'new_set'], inplace=True)
    df['Amb'] = df['Temp'].min()
    if mode == 0:
        df = df[['second', 'AhCha', 'AhDch', 'Amb', 'Temp']]
    elif mode == 1:
        df = df[['second', 'Current', 'Voltage', 'Amb', 'Temp']]
    elif mode == 2:
        df = df[['second', 'Current', 'Voltage', 'AhCha', 'AhDch', 'Amb',
                 'Temp']]
    df.columns = features_list
    return df
def time_eval_frame_mult_python_one_thread(self):
    pd.eval('df * df2 * df3 * df4', engine='python')
def time_eval_frame_chained_cmp_python(self):
    pd.eval('df < df2 < df3 < df4', engine='python')
def _where(cond):
    ev_str, identifiers = cond
    index = pd.eval(ev_str, local_dict=id_dict(identifiers))
    self._curr_val = self._curr_val[index]
def test_raise_on_panel_with_multiindex(self, parser, engine):
    p = tm.makePanel(7)
    p.items = tm.makeCustomIndex(len(p.items), nlevels=2)
    with pytest.raises(NotImplementedError):
        pd.eval('p + 1', parser=parser, engine=engine)
def time_eval_frame_mult_one_thread(self):
    pd.eval('df * df2 * df3 * df4')
def _eval(self, key, where):
    match = arithOp.search(where)
    column = where[:match.start()].strip()
    other = where[match.start():]
    c = self[key + "/" + column]
    return c[pd.eval("c" + other)].index
def time_eval_frame_mult_all_threads(self):
    pd.eval('df * df2 * df3 * df4')
def time_and(self, engine, threads):
    pd.eval('(self.df > 0) & (self.df2 > 0) & '
            '(self.df3 > 0) & (self.df4 > 0)', engine=engine)
def time_eval_frame_add_python_one_thread(self):
    pd.eval('df + df2 + df3 + df4', engine='python')
def time_mult(self, engine, threads):
    pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine)
mask = (x > 0.5) & (y < 0.5)

tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2

import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)

nrows, ncols = 10000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))
df1.head()

df1 + df2 + df3 + df4
pd.eval('df1 + df2 + df3 + df4')

df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
                           for i in range(5))
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1*df2/(df3+df4) - df5')
np.allclose(result1, result2)

result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)

result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)

result1 = df2.T[0] + df2.iloc[1]
result2 = pd.eval('df2.T[0] + df2.iloc[1]')
np.allclose(result1, result2)

df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
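# Likely next step in the walkthrough above (a hedged sketch, not the original
# text): DataFrame.eval() refers to the A/B/C columns by bare name, and '@'
# splices local Python variables into the expression.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result2)  # True

column_mean = df.mean(axis=1)
np.allclose(df['A'] + column_mean, df.eval('A + @column_mean'))  # True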
def time_eval_frame_add_all_threads(self):
    pd.eval('df + df2 + df3 + df4')
def load_data(hp, mode="train", audio_extension='.wav'):
    '''Loads data.

    Args:
      mode: "train" / "validation" / "synthesis" / "demo".
    '''
    assert mode in ('train', 'synthesis', 'validation', 'demo')
    logging.info('Start loading data in mode: %s' % (mode))
    get_speaker_codes = (hp.multispeaker != [])  # False if hp.multispeaker is empty list

    dataset_df_path = os.path.join(hp.featuredir, 'dataset_' + mode + '.csv')

    # In demo mode, we change the "dataset" with only one line each time and
    # do not want to always use the same df.
    if False:  # os.path.exists(dataset_df_path) and mode != 'demo':
        dataset_df = pd.read_csv(dataset_df_path)
        dataset = {}
        # import pdb;pdb.set_trace()
        # this does not work in train mode because of a problem with doing
        # pd.eval() with bytes
        try:
            dataset['texts'] = np.array(
                [pd.eval(e) for e in dataset_df['texts'].tolist()])
        except AttributeError:  # that is why we do this
            dataset['texts'] = np.array(
                [ast.literal_eval(e) for e in dataset_df['texts'].tolist()])
            # I think this causes an error when trying training:
            # tensorflow.python.framework.errors_impl.InvalidArgumentError:
            # Input to DecodeRaw has length 105 that is not a multiple of 4,
            # the size of int32

        dataset['fpaths'] = dataset_df['fpaths'].tolist()  # at synthesis, fpaths only a way to get bases -- wav files probably do not exist
        dataset['text_lengths'] = dataset_df['text_lengths'].tolist()  # only used in training (where length information lost due to string format) - TODO: good motivation for this format?
        dataset['audio_lengths'] = dataset_df['audio_lengths'].tolist()  # might be []
        dataset['label_lengths'] = dataset_df['label_lengths'].tolist()  # might be []

        if get_speaker_codes:
            dataset['speakers'] = dataset_df['speakers'].tolist()
        if hp.use_external_durations:
            dataset['durations'] = dataset_df['durations'].tolist()

        return dataset
    else:
        if mode in ['synthesis', 'demo']:
            get_speaker_codes = False  # never read speaker from transcript for synthesis -- take user-specified speaker instead

        # Load vocabulary
        char2idx, idx2char = load_vocab(hp)

        if mode in ["train", "validation"]:
            transcript = os.path.join(hp.transcript)
        elif mode == 'synthesis':
            transcript = os.path.join(hp.test_transcript)
        else:
            transcript = './demo/transcript.csv'

        if hp.multispeaker:
            speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))

        fpaths, text_lengths, texts, speakers, durations = [], [], [], [], []
        audio_lengths, label_lengths = [], []
        lines = codecs.open(transcript, 'r', 'utf-8').readlines()

        too_long_count_frames = 0
        too_long_count_text = 0
        no_data_count = 0

        nframes = 0  # default 'False' value
        for line in tqdm(lines, desc='load_data'):
            line = line.strip('\n\r |')
            if line == '':
                continue
            fields = line.strip().split("|")

            assert len(fields) >= 1, fields
            if len(fields) > 1:
                assert len(fields) >= 3, fields

            fname = fields[0]
            if len(fields) > 1:
                unnorm_text, norm_text = fields[1:3]
            else:
                norm_text = None  # to test if audio only

            if hp.validpatt:
                if mode == "train":
                    if hp.validpatt in fname:
                        continue
                elif mode == "validation":
                    if hp.validpatt not in fname:
                        continue

            if len(fields) >= 4:
                phones = fields[3]

            if norm_text is None:
                letters_or_phones = []  # [0] -- dummy 'text' (1 character of padding) where we are using audio only
            elif hp.input_type == 'phones':
                if 'speaker_dependent_phones' in hp.multispeaker:
                    speaker_code = speaker
                else:
                    speaker_code = ''
                # in case of phones, all EOS markers are assumed included
                phones = phones_normalize(phones, char2idx,
                                          speaker_code=speaker_code)
                letters_or_phones = [char2idx[char] for char in phones]
            elif hp.input_type == 'letters':
                text = text_normalize(norm_text, hp) + "E"  # E: EOS
                letters_or_phones = [char2idx[char] for char in text]

            text_length = len(letters_or_phones)

            if text_length > hp.max_N:
                # print('number of letters/phones for %s is %s, exceeds max_N %s: skip it' % (fname, text_length, hp.max_N))
                too_long_count_text += 1
                continue

            if mode in ["train", "validation"] and os.path.exists(
                    hp.coarse_audio_dir):
                mel = "{}/{}".format(hp.coarse_audio_dir, fname + ".npy")
                if not os.path.exists(mel):
                    logging.debug('no file %s' % (mel))
                    no_data_count += 1
                    continue
                nframes = np.load(mel).shape[0]
                if nframes > hp.max_T:
                    # print('number of frames for %s is %s, exceeds max_T %s: skip it' % (fname, nframes, hp.max_T))
                    too_long_count_frames += 1
                    continue
                audio_lengths.append(nframes)

            texts.append(np.array(letters_or_phones, np.int32))

            fpath = os.path.join(hp.waveforms, fname + audio_extension)
            fpaths.append(fpath)
            text_lengths.append(text_length)

            # get speaker before phones in case we need to get
            # speaker-dependent phones
            if get_speaker_codes:
                assert len(fields) >= 5, fields
                speaker = fields[4]
                speaker_ix = speaker2ix[speaker]
                speakers.append(np.array(speaker_ix, np.int32))

            if hp.merlin_label_dir:  # only get shape here -- get the data later
                try:
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir, basename(fpath) + ".npy")).shape
                except TypeError:
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir,
                        basename(fpath.decode('utf-8')) + ".npy")).shape
                label_lengths.append(label_length)
                assert label_dim == hp.merlin_lab_dim

            if hp.use_external_durations:
                assert len(fields) >= 6, fields
                duration_data = fields[5]
                duration_data = [int(value) for value in
                                 re.split(r'\s+', duration_data.strip(' '))]
                duration_data = np.array(duration_data, np.int32)
                if hp.merlin_label_dir:
                    # merlin label contains no skipped items
                    duration_data = duration_data[duration_data > 0]
                    assert len(duration_data) == label_length, (
                        len(duration_data), label_length, fpath)
                else:
                    assert len(duration_data) == text_length, (
                        len(duration_data), text_length, fpath)
                if nframes:
                    assert duration_data.sum() == nframes * hp.r, (
                        duration_data.sum(), nframes * hp.r)
                durations.append(duration_data)

            # !TODO! check this -- duplicated!?
            # if hp.merlin_label_dir:  # only get shape here -- get the data later
            #     label_length, _ = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")).shape
            #     label_lengths.append(label_length)

        # import pdb;pdb.set_trace()
        if mode == "validation":
            if len(texts) == 0:
                logging.error('No validation sentences collected: maybe the '
                              'validpatt %s matches no training data file '
                              'names?' % (hp.validpatt))
                sys.exit(1)

        logging.info('Loaded data for %s sentences' % (len(texts)))
        logging.info('Sentences skipped with missing features: %s' %
                     (no_data_count))
        logging.info('Sentences skipped with > max_T (%s) frames: %s' %
                     (hp.max_T, too_long_count_frames))
        logging.info('Additional sentences skipped with > max_N (%s) '
                     'letters/phones: %s' % (hp.max_N, too_long_count_text))

        if mode == 'train' and hp.n_utts > 0:
            n_utts = hp.n_utts
            assert n_utts <= len(fpaths)
            logging.info('Take first %s (n_utts) sentences for training' %
                         (n_utts))
            fpaths = fpaths[:n_utts]
            text_lengths = text_lengths[:n_utts]
            texts = texts[:n_utts]
            if get_speaker_codes:
                speakers = speakers[:n_utts]
            if audio_lengths:
                audio_lengths = audio_lengths[:n_utts]
            if label_lengths:
                label_lengths = label_lengths[:n_utts]

        if mode == 'train':
            # Return string representation which will be parsed with tf's
            # decode_raw:
            texts = [text.tostring() for text in texts]
            if get_speaker_codes:
                speakers = [speaker.tostring() for speaker in speakers]
            if hp.use_external_durations:
                durations = [d.tostring() for d in durations]

        if mode in ['validation', 'synthesis', 'demo']:
            # Prepare a batch of 'stacked texts' (matrix with number of rows
            # == synthesis batch size, and each row an array of integers)
            stacked_texts = np.zeros((len(texts), hp.max_N), np.int32)
            for i, text in enumerate(texts):
                stacked_texts[i, :len(text)] = text
            texts = stacked_texts

            if hp.use_external_durations:
                stacked_durations = np.zeros((len(texts), hp.max_T, hp.max_N),
                                             np.int32)
                for i, dur in enumerate(durations):
                    duration_matrix = durations_to_hard_attention_matrix(dur)
                    duration_matrix = end_pad_for_reduction_shape_sync(
                        duration_matrix, hp)
                    duration_matrix = duration_matrix[0::hp.r, :]
                    m, n = duration_matrix.shape
                    stacked_durations[i, :m, :n] = duration_matrix
                durations = stacked_durations

        dataset = {}
        dataset['texts'] = texts
        dataset['fpaths'] = fpaths  # at synthesis, fpaths only a way to get bases -- wav files probably do not exist
        dataset['text_lengths'] = text_lengths  # only used in training (where length information lost due to string format) - TODO: good motivation for this format?
        dataset['audio_lengths'] = audio_lengths  # might be []
        dataset['label_lengths'] = label_lengths  # might be []

        dataset_df = dataset.copy()
        try:
            dataset_df['texts'] = dataset_df['texts'].tolist()
        except Exception:
            # It is already a list
            pass
        try:
            if len(dataset_df['audio_lengths']) == 0:
                dataset_df['audio_lengths'] = [0] * len(dataset_df['texts'])
            if len(dataset_df['label_lengths']) == 0:
                dataset_df['label_lengths'] = [0] * len(dataset_df['texts'])
            if not os.path.exists(hp.featuredir):
                os.makedirs(hp.featuredir)
            pd.DataFrame.to_csv(pd.DataFrame.from_records(dataset_df),
                                dataset_df_path)
        except Exception:
            import pdb
            pdb.set_trace()

        if get_speaker_codes:
            dataset['speakers'] = speakers
        if hp.use_external_durations:
            dataset['durations'] = durations

        return dataset
def calculate_benchmark(self, trade_order_df=None, market_df=None,
                        trade_order_name=None, mid=None, bid=None, ask=None,
                        bid_mid_bp=None, ask_mid_bp=None,
                        overwrite_bid_ask=None):

    if not (self._check_calculate_benchmark(trade_order_name=trade_order_name)):
        return trade_order_df

    if mid is None: mid = self._mid
    if bid is None: bid = self._bid
    if ask is None: ask = self._ask
    if bid_mid_bp is None: bid_mid_bp = self._bid_mid_bp
    if ask_mid_bp is None: ask_mid_bp = self._ask_mid_bp
    if overwrite_bid_ask is None: overwrite_bid_ask = self._overwrite_bid_ask

    bid_mid_bp = float(bid_mid_bp)
    ask_mid_bp = float(ask_mid_bp)

    # market_df_list = [market_df]

    if mid not in market_df.columns:
        market_df[mid] = (market_df[bid].values + market_df[ask].values) / 2.0

    # Calculate the bid-mid and ask-mid spreads from market data
    if bid in market_df.columns and ask in market_df.columns and not overwrite_bid_ask:
        # market_df[bid + '_' + mid + '_spread'] = (market_df[bid].values / market_df[mid].values) - 1.0
        # market_df[ask + '_' + mid + '_spread'] = (market_df[mid].values / market_df[ask].values) - 1.0

        # note: unlike the commented-out lines above, the eval strings below
        # assume the columns are literally named 'bid'/'mid'/'ask'
        market_df[bid + '_' + mid + '_spread'] = pd.eval(
            '(market_df.bid / market_df.mid) - 1.0')
        market_df[ask + '_' + mid + '_spread'] = pd.eval(
            '(market_df.mid / market_df.ask) - 1.0')

    # If we have been asked to overwrite bid/ask columns with an artificial proxy
    elif bid in market_df.columns and ask in market_df.columns and overwrite_bid_ask:
        # otherwise if we don't have sufficient bid/ask data (and only mid
        # data), or if we want to forcibly overwrite it, create a synthetic
        # bid/ask and use the user-specified spread
        market_df[bid + '_' + mid + '_spread'] = -bid_mid_bp / 10000.0
        market_df[ask + '_' + mid + '_spread'] = -ask_mid_bp / 10000.0

        # market_df[bid] = (market_df[mid].values) * (1.0 - bid_mid_bp / 10000.0)
        # market_df[ask] = (market_df[mid].values) / (1.0 - ask_mid_bp / 10000.0)
        # market_df[bid + '_' + mid + '_spread'] = pd.eval('-bid_mid_bp / 10000.0')
        # market_df[ask + '_' + mid + '_spread'] = pd.eval('-ask_mid_bp / 10000.0')

        market_df[bid] = pd.eval(
            '(market_df[mid]) * (1.0 - bid_mid_bp / 10000.0)')
        market_df[ask] = pd.eval(
            '(market_df[mid]) / (1.0 - ask_mid_bp / 10000.0)')

    # If we only have the mid column
    elif mid in market_df.columns and bid not in market_df.columns and ask not in market_df.columns:
        market_df[bid + '_' + mid + '_spread'] = -bid_mid_bp / 10000.0
        market_df[ask + '_' + mid + '_spread'] = -ask_mid_bp / 10000.0

        # market_df[bid] = (market_df[mid].values) * (1.0 - bid_mid_bp / 10000.0)
        # market_df[ask] = (market_df[mid].values) / (1.0 - ask_mid_bp / 10000.0)
        # market_df[bid + '_' + mid + '_spread'] = pd.eval('-bid_mid_bp / 10000.0')
        # market_df[ask + '_' + mid + '_spread'] = pd.eval('-ask_mid_bp / 10000.0')

        market_df[bid] = pd.eval(
            '(market_df.mid) * (1.0 - bid_mid_bp / 10000.0)')
        market_df[ask] = pd.eval(
            '(market_df.mid) / (1.0 - ask_mid_bp / 10000.0)')
    else:
        LoggerManager().getLogger(__name__).warn(
            "Couldn't calculate spread from mid, check market data has "
            "appropriate fields.")

    return trade_order_df, market_df