def check_str_query_method(self, parser, engine):
    """Check == / != comparisons between a string column and a literal.

    With the 'pandas' parser the comparisons work and are equivalent to
    ``Series.isin``; other parsers raise ``NotImplementedError`` because
    string comparisons are rewritten into (Not)In nodes.
    """
    tm.skip_if_no_ne(engine)
    df = DataFrame(randn(10, 1), columns=['b'])
    df['strings'] = Series(list('aabbccddee'))
    expect = df[df.strings == 'a']

    if parser != 'pandas':
        col = 'strings'
        lst = '"a"'
        # every arrangement of column/literal around ==/!=
        lhs = [col] * 2 + [lst] * 2
        rhs = lhs[::-1]
        eq, ne = '==', '!='
        ops = 2 * ([eq] + [ne])

        for lhs, op, rhs in zip(lhs, ops, rhs):
            ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
            # use pytest.raises rather than the deprecated nose-style
            # assertRaises(callable, *args) helper (consistent with
            # test_str_query_method elsewhere in this file)
            with pytest.raises(NotImplementedError):
                df.query(ex, engine=engine, parser=parser,
                         local_dict={'strings': df.strings})
    else:
        res = df.query('"a" == strings', engine=engine, parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('strings == "a"', engine=engine, parser=parser)
        assert_frame_equal(res, expect)
        assert_frame_equal(res, df[df.strings.isin(['a'])])

        expect = df[df.strings != 'a']
        res = df.query('strings != "a"', engine=engine, parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('"a" != strings', engine=engine, parser=parser)
        assert_frame_equal(res, expect)
        assert_frame_equal(res, df[~df.strings.isin(['a'])])
def test_query_with_partially_named_multiindex(self, parser, engine):
    """Query a MultiIndex frame where only one level is named: the named
    level is addressable by its name, the unnamed one via ``ilevel_0``."""
    skip_if_no_pandas_parser(parser)
    a = np.random.choice(['red', 'green'], size=10)
    b = np.arange(10)
    index = MultiIndex.from_arrays([a, b])
    index.names = [None, 'rating']
    df = DataFrame(np.random.randn(10, 2), index=index)

    # named level: equality
    res = df.query('rating == 1', parser=parser, engine=engine)
    ind = Series(df.index.get_level_values('rating').values, index=index,
                 name='rating')
    exp = df[ind == 1]
    assert_frame_equal(res, exp)

    # named level: inequality
    res = df.query('rating != 1', parser=parser, engine=engine)
    ind = Series(df.index.get_level_values('rating').values, index=index,
                 name='rating')
    exp = df[ind != 1]
    assert_frame_equal(res, exp)

    # unnamed level, addressed positionally: equality
    res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
    ind = Series(df.index.get_level_values(0).values, index=index)
    exp = df[ind == "red"]
    assert_frame_equal(res, exp)

    # unnamed level: inequality
    res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
    ind = Series(df.index.get_level_values(0).values, index=index)
    exp = df[ind != "red"]
    assert_frame_equal(res, exp)
def test_nested_scope(self):
    """Name resolution with a non-pandas parser: @-syntax is a
    SyntaxError, bare frame names are undefined inside ``query``, but
    ``pd.eval`` can still evaluate whole expressions over locals."""
    from pandas.core.computation.ops import UndefinedVariableError
    engine = self.engine
    parser = self.parser
    # smoke test
    x = 1  # noqa
    result = pd.eval('x + 1', engine=engine, parser=parser)
    assert result == 2

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    # don't have the pandas parser
    with pytest.raises(SyntaxError):
        df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

    with pytest.raises(UndefinedVariableError):
        df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

    # pd.eval resolves the frames from the enclosing scope
    expected = df[(df > 0) & (df2 > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                     parser=parser)
    assert_frame_equal(expected, result)

    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                     engine=engine, parser=parser)
    assert_frame_equal(expected, result)
def test_nested_scope(self):
    """With the pandas parser, locals referenced via @ (or implicitly by
    ``pd.eval``) resolve from the enclosing Python scope."""
    engine = self.engine
    parser = self.parser
    skip_if_no_pandas_parser(parser)

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))
    expected = df[(df > 0) & (df2 > 0)]

    # both frames referenced as locals via @
    result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
                      parser=parser)
    assert_frame_equal(result, expected)

    # pd.eval supports 'and' with the pandas parser
    result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
                     parser=parser)
    assert_frame_equal(result, expected)

    result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                     engine=engine, parser=parser)
    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    assert_frame_equal(result, expected)

    # eval and query agree on the same expression
    result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
    expected = df.query('(@df>0) & (@df2>0)', engine=engine,
                        parser=parser)
    assert_frame_equal(result, expected)
def test_index_resolvers_come_after_columns_with_the_same_name(self): n = 1 # noqa a = np.r_[20:101:20] df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) df.index.name = 'index' result = df.query('index > 5', engine=self.engine, parser=self.parser) expected = df[df['index'] > 5] assert_frame_equal(result, expected) df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) result = df.query('ilevel_0 > 5', engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected) df = DataFrame({'a': a, 'b': np.random.randn(a.size)}) df.index.name = 'a' result = df.query('a > 5', engine=self.engine, parser=self.parser) expected = df[df.a > 5] assert_frame_equal(result, expected) result = df.query('index > 5', engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected)
def test_query_undefined_local(self):
    """An @-referenced name that is not defined locally must raise
    ``UndefinedVariableError`` with a helpful message."""
    # pandas.computation was relocated to pandas.core.computation; the
    # old import path is deprecated/removed
    from pandas.core.computation.ops import UndefinedVariableError

    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)
    df = DataFrame(np.random.rand(10, 2), columns=list('ab'))

    # pytest.raises(match=...) replaces the deprecated
    # tm.assertRaisesRegexp helper (consistent with the other
    # test_query_undefined_local in this file)
    with pytest.raises(UndefinedVariableError,
                       match="local variable 'c' is not defined"):
        df.query('a == @c', engine=engine, parser=parser)
def test_nested_raises_on_local_self_reference(self):
    """Referring to the queried frame by its own local name (without @)
    must raise ``UndefinedVariableError``."""
    from pandas.core.computation.ops import UndefinedVariableError

    df = DataFrame(np.random.randn(5, 3))

    # can't reference ourself b/c we're a local so @ is necessary
    with pytest.raises(UndefinedVariableError):
        df.query('df > 0', engine=self.engine, parser=self.parser)
def test_query(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) assert_frame_equal(df.query('a < b', engine=engine, parser=parser), df[df.a < df.b]) assert_frame_equal(df.query('a + b > b * c', engine=engine, parser=parser), df[df.a + df.b > df.b * df.c])
def update_progress(
        status: dict,
        settings: dict,
        progress: pd.DataFrame,
        queue: pd.DataFrame,
        passengers: pd.DataFrame,
        **kwargs
) -> pd.DataFrame:
    """
    Append a snapshot row describing the current simulation state and
    return the extended progress frame.

    :param status: The current status of the simulation
    :param settings: Configuration settings for the current trial
    :param progress: The current progress data frame for the snapshot,
        which will be replaced by the one returned
    :param queue: The queue data frame for the trial
    :param passengers: The passengers data frame for the trial
    """
    row = dict(
        progress=status['progress'],
        time=status['time']
    )

    # Per-aisle-position occupancy flags (0 == vacant, 1 == occupied)
    aisle_occupancy = []
    for queue_index in range(queue.shape[0]):
        queue_item = queue.loc[queue_index]
        passenger_index = queue_item['passenger']
        if queue_item['aisle'] >= 0:
            aisle_occupancy.append(0 if passenger_index is None else 1)
        if passenger_index is not None:
            key = 'p_{}'.format(passenger_index)
            row[key] = 'Q:{}'.format(queue_item['aisle'])

    # Passengers in an interchange delay: 'O' == seated (moving out),
    # 'I' == moving in
    interchanging = passengers.query('delay_interchange > 0')
    for passenger_index, passenger in interchanging.iterrows():
        key = 'p_{}'.format(passenger_index)
        row[key] = '{}:{}'.format(
            'O' if passenger['seated'] else 'I',
            passenger['aisle']
        )

    # Passengers fully seated (no pending interchange delay)
    seated = passengers.query('seated and delay_interchange == 0')
    for passenger_index, passenger in seated.iterrows():
        key = 'p_{}'.format(passenger_index)
        row[key] = 'S:{}'.format(passenger['aisle'])

    # Guard against an empty aisle list to avoid ZeroDivisionError
    row['aisle_density'] = (
        sum(aisle_occupancy) / len(aisle_occupancy)
        if aisle_occupancy else 0
    )

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent
    return pd.concat([progress, pd.DataFrame([row])], ignore_index=True)
def test_query_doesnt_pickup_local(self):
    """With a non-pandas parser, plain names never resolve to Python
    locals or builtins (e.g. ``sin``)."""
    from pandas.core.computation.ops import UndefinedVariableError

    engine, parser = self.engine, self.parser
    n = m = 10
    df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

    # we don't pick up the local 'sin'
    with pytest.raises(UndefinedVariableError):
        df.query('sin > 5', engine=engine, parser=parser)
def test_query_undefined_local(self):
    """An @-referenced name that is not defined locally raises
    ``UndefinedVariableError`` with a helpful message."""
    from pandas.core.computation.ops import UndefinedVariableError

    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)
    df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
    msg = "local variable 'c' is not defined"

    with pytest.raises(UndefinedVariableError, match=msg):
        df.query('a == @c', engine=engine, parser=parser)
def test_date_index_query_with_NaT_duplicates(self):
    """Chained comparisons against a NaT-containing date index are not
    implemented for non-pandas parsers."""
    engine, parser = self.engine, self.parser

    n = 10
    df = DataFrame(np.random.randn(n, 3))
    df['dates1'] = date_range('1/1/2012', periods=n)
    df['dates3'] = date_range('1/1/2014', periods=n)
    # knock out roughly half of the index dates with NaT
    df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
    df.set_index('dates1', inplace=True, drop=True)
    with pytest.raises(NotImplementedError):
        df.query('index < 20130101 < dates3', engine=engine, parser=parser)
def test_query_builtin(self):
    """An index name that clashes with a numexpr builtin (``sin``)
    raises ``NumExprClobberingError`` instead of silently resolving."""
    from pandas.core.computation.engines import NumExprClobberingError

    engine, parser = self.engine, self.parser

    n = m = 10
    df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

    df.index.name = 'sin'
    msg = 'Variables in expression.+'
    with pytest.raises(NumExprClobberingError, match=msg):
        df.query('sin > 5', engine=engine, parser=parser)
def test_object_array_eq_ne(self, parser, engine): df = DataFrame({'a': list('aaaabbbbcccc'), 'b': list('aabbccddeeff'), 'c': np.random.randint(5, size=12), 'd': np.random.randint(9, size=12)}) res = df.query('a == b', parser=parser, engine=engine) exp = df[df.a == df.b] assert_frame_equal(res, exp) res = df.query('a != b', parser=parser, engine=engine) exp = df[df.a != df.b] assert_frame_equal(res, exp)
def test_date_query_with_non_date(self):
    """Comparisons between a datetime column and a non-date column must
    raise ``TypeError`` for every comparison operator."""
    engine, parser = self.engine, self.parser

    n = 10
    df = DataFrame({'dates': date_range('1/1/2012', periods=n),
                    'nondate': np.arange(n)})

    ops = '==', '!=', '<', '>', '<=', '>='

    for op in ops:
        # pytest.raises replaces the deprecated tm.assertRaises helper
        # (consistent with the sibling test_date_query_with_non_date)
        with pytest.raises(TypeError):
            df.query('dates %s nondate' % op, parser=parser, engine=engine)
def test_local_syntax(self):
    """@name reaches Python locals; a bare name prefers the column."""
    skip_if_no_pandas_parser(self.parser)

    engine, parser = self.engine, self.parser
    df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij'))
    b = 1

    # '@b' is the local integer b ...
    expect = df[df.a < b]
    result = df.query('a < @b', engine=engine, parser=parser)
    assert_frame_equal(result, expect)

    # ... while a bare 'b' is the column
    expect = df[df.a < df.b]
    result = df.query('a < b', engine=engine, parser=parser)
    assert_frame_equal(result, expect)
def test_query_index_with_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), index=Index(range(10), name='blob'), columns=['a', 'b', 'c']) res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) expec = df[(df.index < 5) & (df.a < df.b)] assert_frame_equal(res, expec) res = df.query('blob < b', engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec)
def test_query_index_without_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), index=range(10), columns=['a', 'b', 'c']) # "index" should refer to the index res = df.query('index < b', engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec) # test against a scalar res = df.query('index < 5', engine=engine, parser=parser) expec = df[df.index < 5] assert_frame_equal(res, expec)
def test_local_variable_with_in(self):
    """'in' works with a column expression on the left and also with an
    @-referenced local Series."""
    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)
    a = Series(np.random.randint(3, size=15), name='a')
    b = Series(np.random.randint(10, size=15), name='b')
    df = DataFrame({'a': a, 'b': b})

    expected = df.loc[(df.b - 1).isin(a)]
    result = df.query('b - 1 in a', engine=engine, parser=parser)
    assert_frame_equal(expected, result)

    # left operand taken from the enclosing scope via @
    b = Series(np.random.randint(10, size=15), name='b')
    expected = df.loc[(b - 1).isin(a)]
    result = df.query('@b - 1 in a', engine=engine, parser=parser)
    assert_frame_equal(expected, result)
def QA_fetch_stock_min_adv(code, start, end, type_='1min', if_drop_index=False,
                           collections=QA_Setting.client.quantaxis.stock_min):
    """Fetch stock minute bars from MongoDB and wrap them as a
    QA_DataStruct_Stock_min (获取股票分钟线).

    Rows with volume <= 1 are dropped; the result is indexed by
    (datetime, code).
    """
    # normalise frequency aliases ('1m' -> '1min', ...); anything else
    # passes through unchanged
    aliases = {'1min': '1min', '1m': '1min',
               '5min': '5min', '5m': '5min',
               '15min': '15min', '15m': '15min',
               '30min': '30min', '30m': '30min',
               '60min': '60min', '60m': '60min'}
    type_ = aliases.get(type_, type_)

    cursor = collections.find({
        'code': str(code),
        "time_stamp": {
            "$gte": QA_util_time_stamp(start),
            "$lte": QA_util_time_stamp(end)
        },
        'type': type_
    })

    rows = [[str(item['code']), float(item['open']), float(item['high']),
             float(item['low']), float(item['close']), float(item['vol']),
             item['datetime'], item['time_stamp'], item['date']]
            for item in cursor]

    frame = DataFrame(rows, columns=['code', 'open', 'high', 'low', 'close',
                                     'volume', 'datetime', 'time_stamp',
                                     'date'])
    frame['datetime'] = pd.to_datetime(frame['datetime'])

    return QA_DataStruct_Stock_min(
        frame.query('volume>1').set_index(['datetime', 'code'],
                                          drop=if_drop_index))
def from_contract_description(cls, contracts: pd.DataFrame, position, premium,
                              option_type=None, strike_price=None,
                              underlying_asset=None, expiry=None, quantity=1):
    """Select the single contract matching the given description and
    build an instance from its contract id.

    :param contracts: frame of contract descriptions (Right / Strike /
        Symbol / Expiry columns)
    :raises ValueError: if the description matches more than one contract
    """
    queries = []
    if option_type is not None:
        if option_type == OptionType.Call:
            queries.append("Right=='C'")
        elif option_type == OptionType.Put:
            queries.append("Right=='P'")
    if strike_price is not None:
        queries.append("Strike==" + str(strike_price))
    if underlying_asset is not None:
        queries.append("Symbol=='{}'".format(underlying_asset))
    if expiry is not None:
        # BUG FIX: this previously filtered on Symbol (copy/paste error),
        # which clashed with the underlying_asset filter and never
        # constrained the expiry; filter on the Expiry column instead.
        queries.append("Expiry=='{}'".format(expiry))

    # join the individual conditions into one boolean expression
    # (None when no criteria were given, matching the original behavior)
    query = " and ".join(queries) if queries else None

    selected_contract = contracts.query(query)
    if selected_contract.shape[0] > 1:
        raise ValueError()
    else:
        return cls.from_ConId(contracts, selected_contract.index[0],
                              position, premium, quantity)
def open(self, slug):
    """WebSocket open handler: stream the dataset identified by *slug*
    to the client as a series of messages (columns, rows, categories).

    NOTE(review): Python-2-era APIs in use (dict.iteritems, the long
    deprecated DataFrame.to_dict(outtype=...)) — confirm the target
    runtime before modernizing.
    """
    columns = json.loads(MyBucket.get('{}-columns'.format(slug)).data)
    fields = columns
    # optional client-side column selection via ?fields=a,b,c
    if self.get_argument('fields', None):
        fields = self.get_argument('fields').split(',')
    self.write_message({'type': 'columns', 'data': fields})

    # request arguments of the form 'filter__...' become query filters
    filters = [i[0] for i in self.request.arguments.iteritems()
               if len(i[0].split('filter__')) > 1]

    df = DataFrame(MyBucket.get(slug).data, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            df = df.query(df_generate(df, self.get_argument, f))

    # look up the element's configured category column, if any
    ca = None
    for e in MyAdminBucket.get('element').data:
        if e['slug'] == slug:
            ca = e['categories']

    categories = []
    for i in df.to_dict(outtype='records'):
        if ca:
            categories.append(i[ca])
        self.write_message({'type': 'data', 'data': i})

    self.write_message({'type': 'categories', 'data': categories})
    self.write_message({'type': 'close'})
def test_query_with_nested_special_character(self, parser, engine):
    """A quoted string containing '&' must be treated as a literal, not
    as a boolean operator."""
    skip_if_no_pandas_parser(parser)
    frame = DataFrame({'a': ['a', 'b', 'test & test'],
                       'b': [1, 2, 3]})
    result = frame.query('a == "test & test"', parser=parser,
                         engine=engine)
    assert_frame_equal(result, frame[frame.a == 'test & test'])
def post(self, slug):
    """Serve the (possibly filtered) dataset for *slug* as JSON, using
    the cache when both the requested fields and filters match the
    cached ones."""
    columns = json.loads(MyBucket.get('{}-columns'.format(slug)).data)
    fields = columns
    # optional client-side column selection via ?fields=a,b,c
    if self.get_argument('fields', None):
        fields = self.get_argument('fields').split(',')

    # request arguments of the form 'filter__...' become query filters
    filters = [i[0] for i in self.request.arguments.iteritems()
               if len(i[0].split('filter__')) > 1]

    fields_json = json.dumps(fields)
    filters_json = json.dumps({f: self.get_argument(f) for f in filters})

    # BUG FIX: the filters cache key was misspelled ('{}-fulters'), so
    # the comparison never matched the '{}-filters' key written below
    # and cached responses could be served for the wrong filters.
    if MyCache.get(str(slug)) and\
            MyCache.get('{}-columns'.format(slug)) == fields_json and\
            MyCache.get('{}-filters'.format(slug)) == filters_json:
        self.write(MyCache.get(str(slug)))
        self.finish()
        # BUG FIX: without this return the handler kept running after
        # finishing the request and wrote the payload a second time.
        return

    MyCache.set('{}-columns'.format(slug), fields_json)
    MyCache.set('{}-filters'.format(slug), filters_json)

    df = DataFrame(MyBucket.get(slug).data, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            df = df.query(df_generate(df, self.get_argument, f))

    convert = df.to_dict(outtype='records')
    write = json.dumps({'columns': fields, 'json': convert})
    MyCache.set(str(slug), write)
    self.write(write)
    self.finish()
def test_query_single_element_booleans(self, parser, engine): columns = 'bid', 'bidsize', 'ask', 'asksize' data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) res = df.query('bid & ask', engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected)
def test_date_query_with_non_date(self):
    """Comparing a datetime column with a non-date column: equality and
    inequality yield empty/full results, while ordering comparisons
    raise ``TypeError``."""
    engine, parser = self.engine, self.parser

    n = 10
    df = DataFrame({'dates': date_range('1/1/2012', periods=n),
                    'nondate': np.arange(n)})

    result = df.query('dates == nondate', parser=parser, engine=engine)
    assert len(result) == 0

    result = df.query('dates != nondate', parser=parser, engine=engine)
    assert_frame_equal(result, df)

    # ordering comparisons between dates and ints are undefined
    for op in ['<', '>', '<=', '>=']:
        with pytest.raises(TypeError):
            df.query('dates %s nondate' % op, parser=parser, engine=engine)
def f(self, f):
    """
    Filter trades based on conditions f
    Any valid pandas dataframe query
    """
    return DataFrame(self._trades).query(f)
def f(self, query):
    """
    Filter data based on query
    query A valid pandas dataframe query
    """
    return DataFrame(self._cash).query(query)
def create_trackway_info(
        trackway_name: str,
        trackway_df: pd.DataFrame,
        **kwargs
) -> dict:
    """
    Creates information about the trackway: its name plus counts of pes
    (hind-foot) and manus (fore-foot) prints.
    """
    pes_prints = trackway_df.query('is_pes == True')
    manus_prints = trackway_df.query('is_pes == False')

    return {
        'trackway': trackway_name,
        'pes_count': len(pes_prints),
        'manus_count': len(manus_prints),
    }
def test_at_inside_string(self):
    """A literal '@' inside a quoted string is plain text, not a
    local-variable marker."""
    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)

    c = 1  # noqa
    frame = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']})
    result = frame.query('a == "@c"', engine=engine, parser=parser)
    assert_frame_equal(result, frame[frame.a == "@c"])
def test_query_scope(self):
    """Bare names are columns, @names are locals, and missing names of
    either kind raise ``UndefinedVariableError``."""
    from pandas.core.computation.ops import UndefinedVariableError

    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)

    df = DataFrame(np.random.randn(20, 2), columns=list('ab'))

    a, b = 1, 2  # noqa
    res = df.query('a > b', engine=engine, parser=parser)
    expected = df[df.a > df.b]
    assert_frame_equal(res, expected)

    res = df.query('@a > b', engine=engine, parser=parser)
    expected = df[a > df.b]
    assert_frame_equal(res, expected)

    # no local variable c
    with pytest.raises(UndefinedVariableError):
        df.query('@a > b > @c', engine=engine, parser=parser)

    # no column named 'c'
    with pytest.raises(UndefinedVariableError):
        df.query('@a > b > c', engine=engine, parser=parser)
def _calc_confusion_matrix_terminology(self, user_merged: pd.DataFrame, cutoff: int = None): if self.relevant_threshold is None: relevant_threshold = user_merged['score_truth'].mean() else: relevant_threshold = self.relevant_threshold if cutoff: # We consider as 'not_predicted' also those excluded from cutoff other than those # not effectively retrieved (score_pred is nan) actually_predicted = user_merged.query('score_pred.notna()', engine='python')[:cutoff] not_predicted = user_merged.query('score_pred.notna()', engine='python')[cutoff:] if not user_merged.query('score_pred.isna()', engine='python').empty: not_predicted = pd.concat([not_predicted, user_merged.query('score_pred.isna()', engine='python')]) else: actually_predicted = user_merged.query('score_pred.notna()', engine='python') not_predicted = user_merged.query('score_pred.isna()', engine='python') tp = len(actually_predicted.query('score_truth >= @relevant_threshold')) fp = len(actually_predicted.query('(score_truth < @relevant_threshold) or (score_truth.isna())', engine='python')) tn = len(not_predicted.query('score_truth < @relevant_threshold')) fn = len(not_predicted.query('score_truth >= @relevant_threshold')) return tp, fp, tn, fn
def test_query_with_string_columns(self, parser, engine):
    """'in' between two string columns works only with the pandas
    parser; other parsers raise ``NotImplementedError``."""
    df = DataFrame({
        "a": list("aaaabbbbcccc"),
        "b": list("aabbccddeeff"),
        "c": np.random.randint(5, size=12),
        "d": np.random.randint(9, size=12),
    })
    if parser == "pandas":
        res = df.query("a in b", parser=parser, engine=engine)
        expec = df[df.a.isin(df.b)]
        tm.assert_frame_equal(res, expec)

        res = df.query("a in b and c < d", parser=parser, engine=engine)
        expec = df[df.a.isin(df.b) & (df.c < df.d)]
        tm.assert_frame_equal(res, expec)
    else:
        # (Not)In and BoolOp nodes are unsupported outside the pandas
        # parser
        msg = r"'(Not)?In' nodes are not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            df.query("a in b", parser=parser, engine=engine)

        msg = r"'BoolOp' nodes are not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            df.query("a in b and c < d", parser=parser, engine=engine)
def test_str_list_query_method(self, parser, engine):
    """== / != between a string column and a list literal behaves as set
    membership; non-pandas parsers raise ``NotImplementedError``."""
    df = DataFrame(np.random.randn(10, 1), columns=['b'])
    df['strings'] = Series(list('aabbccddee'))
    expect = df[df.strings.isin(['a', 'b'])]

    if parser != 'pandas':
        col = 'strings'
        lst = '["a", "b"]'
        # every arrangement of column/literal around ==/!=
        lhs = [col] * 2 + [lst] * 2
        rhs = lhs[::-1]
        eq, ne = '==', '!='
        ops = 2 * ([eq] + [ne])

        for lhs, op, rhs in zip(lhs, ops, rhs):
            ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
            with pytest.raises(NotImplementedError):
                df.query(ex, engine=engine, parser=parser)
    else:
        res = df.query('strings == ["a", "b"]', engine=engine,
                       parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('["a", "b"] == strings', engine=engine,
                       parser=parser)
        assert_frame_equal(res, expect)

        expect = df[~df.strings.isin(['a', 'b'])]
        res = df.query('strings != ["a", "b"]', engine=engine,
                       parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('["a", "b"] != strings', engine=engine,
                       parser=parser)
        assert_frame_equal(res, expect)
def test_query_scope(self):
    """Bare names are columns, @names are locals; missing names of
    either kind raise ``UndefinedVariableError`` with distinct messages."""
    engine, parser = self.engine, self.parser
    skip_if_no_pandas_parser(parser)

    df = DataFrame(np.random.randn(20, 2), columns=list("ab"))

    a, b = 1, 2  # noqa:F841
    res = df.query("a > b", engine=engine, parser=parser)
    expected = df[df.a > df.b]
    tm.assert_frame_equal(res, expected)

    res = df.query("@a > b", engine=engine, parser=parser)
    expected = df[a > df.b]
    tm.assert_frame_equal(res, expected)

    # no local variable c
    with pytest.raises(UndefinedVariableError,
                       match="local variable 'c' is not defined"):
        df.query("@a > b > @c", engine=engine, parser=parser)

    # no column named 'c'
    with pytest.raises(UndefinedVariableError,
                       match="name 'c' is not defined"):
        df.query("@a > b > c", engine=engine, parser=parser)
def test_str_query_method(self, parser, engine):
    """== / != between a string column and a scalar literal; non-pandas
    parsers reject these as (Not)In nodes."""
    df = DataFrame(np.random.randn(10, 1), columns=['b'])
    df['strings'] = Series(list('aabbccddee'))
    expect = df[df.strings == 'a']

    if parser != 'pandas':
        col = 'strings'
        lst = '"a"'
        # every arrangement of column/literal around ==/!=
        lhs = [col] * 2 + [lst] * 2
        rhs = lhs[::-1]
        eq, ne = '==', '!='
        ops = 2 * ([eq] + [ne])

        for lhs, op, rhs in zip(lhs, ops, rhs):
            ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
            msg = r"'(Not)?In' nodes are not implemented"
            with pytest.raises(NotImplementedError, match=msg):
                df.query(ex, engine=engine, parser=parser,
                         local_dict={'strings': df.strings})
    else:
        res = df.query('"a" == strings', engine=engine, parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('strings == "a"', engine=engine, parser=parser)
        assert_frame_equal(res, expect)
        assert_frame_equal(res, df[df.strings.isin(['a'])])

        expect = df[df.strings != 'a']
        res = df.query('strings != "a"', engine=engine, parser=parser)
        assert_frame_equal(res, expect)

        res = df.query('"a" != strings', engine=engine, parser=parser)
        assert_frame_equal(res, expect)
        assert_frame_equal(res, df[~df.strings.isin(['a'])])
def select(dataframe: pd.DataFrame, query: str, **where: str) -> pd.DataFrame:
    """Interpolate *where* values into *query*, run it against
    *dataframe*, and drop all-empty rows and columns from the result."""
    statement = ' '.join(query.format(**where).splitlines())
    filtered = dataframe.query(statement)
    return filtered.dropna(axis=1, how='all').dropna(axis=0, how='all')
def __trading_energy_generator(df: pd.DataFrame, date: date, duid_id: str,
                               power_field: str = "generated") -> pd.DataFrame:
    """Roll 5-minute power readings up into 30-minute trading-interval
    energy values for one facility (DUID) on one day.

    :param df: readings indexed by trading_interval with facility_code,
        fueltech_id and *power_field* columns
    :param date: the calendar day to process
    :param duid_id: facility code to filter on
    :param power_field: column holding the power readings
    :return: list of row dicts
        (NOTE(review): annotated as pd.DataFrame but actually returns a
        list — confirm the intended annotation)
    """
    return_cols = []

    t_start = datetime(date.year, date.month, date.day, 0, 5,
                       tzinfo=NetworkNEM.get_fixed_offset())

    # 48 trading intervals in the day
    # (could be better with groupby function)
    for TI in range(48):
        # t_i initial timestamp of trading_interval, t_f = final
        # timestamp of trading interval
        t_i = t_start + timedelta(0, 1800 * TI)
        t_f = t_start + timedelta(0, 1800 * (TI + 1))

        _query = f"'{t_i}' <= trading_interval <= '{t_f}' and facility_code == '{duid_id}'"
        d_ti = df.query(_query)

        energy_value = None
        trading_interval = None

        # rooftop 30m intervals - AEMO rooftop is going to go in a
        # separate network so this won't be required
        if (d_ti.fueltech_id.all() == "solar_rooftop") and (d_ti[power_field].count() == 1):
            # single 30-minute reading: energy = power * 0.5h
            energy_value = d_ti[power_field].sum() / 2
            # ooofff - this delta comes back off as part of NEM offset
            trading_interval = d_ti.index[0] + timedelta(minutes=5)

        # interpolate if it isn't padded out (expects 7 readings per
        # half-hour window)
        elif d_ti[power_field].count() != 7:
            index_interpolated = pd.date_range(
                start=t_i, end=t_f, freq="5min",
                tz=NetworkNEM.get_timezone())
            d_ti = d_ti.reset_index()
            d_ti = d_ti.set_index("trading_interval")
            d_ti = d_ti.reindex(index_interpolated)
            d_ti["facility_code"] = duid_id
            # missing readings are padded with zeroes
            d_ti[power_field] = d_ti[power_field].replace(np.NaN, 0)

            if d_ti[power_field].count() != 7:
                # NOTE(review): logger.warn is the deprecated alias of
                # logger.warning
                logger.warn("Interpolated frame didn't match generated count")

        try:
            if d_ti.fueltech_id.all() != "solar_rooftop":
                # trapezoidal integration over the 5-minute readings
                energy_value = __trapezium_integration(d_ti, power_field)
                trading_interval = d_ti.index[-2]
        except ValueError as e:
            logger.error("Error with {} at {} {}: {}".format(
                duid_id, t_i, t_f, e))

        # only emit a row when there was any data in the window
        if not d_ti.index.empty:
            return_cols.append({
                "trading_interval": trading_interval,
                "network_id": "NEM",
                "facility_code": duid_id,
                "eoi_quantity": energy_value,
            })

    return return_cols
def get_normalized_policy_shifts_and_current_policy_all_countries(
        policy_data_countries: pd.DataFrame,
        past_parameters: pd.DataFrame) -> (dict, dict):
    """
    Computes the normalized policy shifts and the current policy in each area of the world
    except the US (done in a separate function)
    :param policy_data_countries: processed dataframe with the MECE policies implemented per area for every day
    :param past_parameters: past parameters file used for policy shift generation (specifically computation of
    gamma(t) values in the process
    :return: a tuple of two dictionaries, {policy: normalized_shift_float_international} and {area: current_policy}
    """
    dict_current_policy = {}
    policy_list = future_policies
    # normalised (comma-stripped, lower-cased) country names for joining
    # against the past-parameters file
    policy_data_countries["country_cl"] = policy_data_countries[
        "country"].apply(lambda x: x.replace(",", "").strip().lower())
    past_parameters_copy = deepcopy(past_parameters)
    past_parameters_copy["Country"] = past_parameters_copy["Country"].apply(
        lambda x: str(x).replace(",", "").strip().lower())
    params_countries = past_parameters_copy["Country"]
    params_countries = set(params_countries)
    # restrict to countries that actually have past parameters
    policy_data_countries_bis = policy_data_countries.query(
        "country_cl in @params_countries")
    countries_upper_set = set(policy_data_countries[
        policy_data_countries.country != "US"]["country"])
    # countries_in_oxford_and_params = params_countries.intersection(countries_upper_set)
    for country in countries_upper_set:
        # current policy = the policy flagged 1 on the country's latest date
        dict_current_policy[(country, "None")] = list(
            compress(
                policy_list,
                (policy_data_countries.query("country == @country")
                 [policy_data_countries.query("country == @country")["date"]
                  == policy_data_countries.query("country == @country").date.max(
                 )][policy_list] == 1).values.flatten().tolist(),
            ))[0]
    countries_common = sorted([x.lower() for x in countries_upper_set])
    pastparam_tuples_in_oxford = past_parameters_copy[
        (past_parameters_copy.Country.isin(countries_common))
        & (past_parameters_copy.Province != "None")].reset_index(drop=True)
    pastparam_tuples_in_oxford["tuple_name"] = list(
        zip(pastparam_tuples_in_oxford.Country,
            pastparam_tuples_in_oxford.Province))
    # propagate each country's current policy down to its provinces
    for tuple in pastparam_tuples_in_oxford.tuple_name.unique():
        country, province = tuple
        country = country[0].upper() + country[1:]
        dict_current_policy[(country, province)] = dict_current_policy[(country, "None")]
    countries_set = set(policy_data_countries["country_cl"])
    params_dic = {}
    countries_set = countries_set.intersection(params_countries)
    for country in countries_set:
        params_dic[country] = past_parameters_copy.query(
            "Country == @country")[[
                "Data Start Date", "Median Day of Action", "Rate of Action"
            ]].iloc[0]
    # per-row gamma(t), used to score the effect of each active policy
    policy_data_countries_bis["Gamma"] = [
        gamma_t(day, country, params_dic) for day, country in
        zip(policy_data_countries_bis["date"],
            policy_data_countries_bis["country_cl"])
    ]
    # NOTE(review): assumes the policy indicator columns occupy positions
    # 3..-2 and Gamma is the last column — verify against the upstream
    # frame layout
    n_measures = policy_data_countries_bis.iloc[:, 3:-2].shape[1]
    # mean gamma while each policy was active, normalised by policy 0
    dict_normalized_policy_gamma = {
        policy_data_countries_bis.columns[3 + i]: policy_data_countries_bis[
            policy_data_countries_bis.iloc[:, 3 + i] == 1].iloc[:, -1].mean()
        for i in range(n_measures)
    }
    normalize_val = dict_normalized_policy_gamma[policy_list[0]]
    for policy in dict_normalized_policy_gamma.keys():
        dict_normalized_policy_gamma[policy] = (
            dict_normalized_policy_gamma[policy] / normalize_val)
    return dict_normalized_policy_gamma, dict_current_policy
def remove_forfeits(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose score records a forfeit rather than a result."""
    forfeit_free = df.query("score != 'Forfeit'")
    return forfeit_free
def _split_ber_by_year_of_construction( df: pd.DataFrame, condition: str, ) -> pd.DataFrame: return df.query(condition).drop_duplicates()
# Notebook-style script fragment: pull the cleaned spam corpus out of
# MongoDB into a DataFrame and inspect the HTML-bodied messages.
# NOTE(review): `client` is used before the MongoClient import/creation
# below — presumably defined in an earlier notebook cell; verify before
# running top to bottom.
db = client['spam_database']
collection = db.spam_clean2
dataframe_spam_clean = DataFrame(list(collection.find()))

# In[3]:

len(dataframe_spam_clean)

# In[4]:

# NOTE(review): dataframe_ham_clean is not defined in this fragment —
# presumably created in an earlier cell.
len(dataframe_ham_clean)

# In[5]:

# rows whose body content-type is text/html, selected columns only
dataframe_spam_clean.query('ContentType_body == ["text/html"]')[[
    'Subject', 'ContentType_body', 'body', 'body_text_normalize'
]]

# In[6]:

from pandas import DataFrame
from pymongo import MongoClient

client = MongoClient('mongodb://192.168.67.90:27017')
client.database_names()
db = client['spam_database']
collection = db.spam_clean
dataframe_spam_clean = DataFrame(list(collection.find()))

# In[7]:
def test_query_with_unnamed_multiindex(self, parser, engine):
    """Unnamed MultiIndex levels are addressable as ilevel_0 / ilevel_1
    for ==, !=, list (set-membership) equality and in/not in, with the
    level on either side of the operator."""
    skip_if_no_pandas_parser(parser)
    a = np.random.choice(["red", "green"], size=10)
    b = np.random.choice(["eggs", "ham"], size=10)
    index = MultiIndex.from_arrays([a, b])
    df = DataFrame(np.random.randn(10, 2), index=index)
    ind = Series(df.index.get_level_values(0).values, index=index)

    # equality, level on either side
    res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
    res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
    exp = df[ind == "red"]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # inequality
    res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
    res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
    exp = df[ind != "red"]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # list equality (really just set membership)
    res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
    res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
    exp = df[ind.isin(["red"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
    res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
    exp = df[~ind.isin(["red"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # in/not in ops
    res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
    res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
    exp = df[ind.isin(["red"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    res1 = df.query('["red"] not in ilevel_0', parser=parser,
                    engine=engine)
    res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
    exp = df[~ind.isin(["red"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # ## LEVEL 1
    ind = Series(df.index.get_level_values(1).values, index=index)
    res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
    res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
    exp = df[ind == "eggs"]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # inequality
    res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
    res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
    exp = df[ind != "eggs"]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # list equality (really just set membership)
    res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
    res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
    exp = df[ind.isin(["eggs"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
    res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
    exp = df[~ind.isin(["eggs"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    # in/not in ops
    res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
    res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
    exp = df[ind.isin(["eggs"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)

    res1 = df.query('["eggs"] not in ilevel_1', parser=parser,
                    engine=engine)
    res2 = df.query('"eggs" not in ilevel_1', parser=parser,
                    engine=engine)
    exp = df[~ind.isin(["eggs"])]
    tm.assert_frame_equal(res1, exp)
    tm.assert_frame_equal(res2, exp)
def get_context_data(self, **kwargs):
    """Build the template context for a stay-detail page.

    Selects the matching stay row from ``self.stays`` (a DataFrame),
    then loads hotel, room, board, facility, image and TripAdvisor
    data for one or two hotels (two when the stay involves a hotel
    switch, i.e. ``hotel_2_id > 0``).
    """
    context = super(StayDetail, self).get_context_data(**kwargs)
    stays = self.stays
    hotel_1_id = int(kwargs['hotel_1_id'])
    # hotel_2_id defaults to 0, meaning "no second hotel / no switch".
    hotel_2_id = int(kwargs.get('hotel_2_id', 0))
    check_in_2 = self.kwargs.get('check_in_2')
    query = 'hotel_1_id == @hotel_1_id'
    max_switch_count = stays['switch_count'].max()
    if max_switch_count > 0:  # pragma: no cover
        # With switch stays present, also constrain on the second hotel
        # (or explicitly select the no-switch rows when hotel_2_id == 0).
        query = query + \
            '& ((hotel_2_id == @hotel_2_id & check_in_2 == @check_in_2) \
| (@hotel_2_id == 0 & switch_count == 0))'
    stay = stays.query(query)
    # The second stay's check-in coincides with the first stay's check-out
    # (hotel switch happens the same day) — presumably intentional; verify.
    check_out_1 = check_in_2 = datetime.strptime(
        stay['check_out_1'].values[0], '%Y-%m-%d')
    stay = stay.to_dict('records')[0]
    # Full facility catalogue, normalised into a DataFrame for merging.
    facilities = HotelbedsFacility.objects.all().iterator()
    facilities = [{
        'code': facility.code,
        'group': facility.group,
        'description': facility.description,
    } for facility in facilities]
    facilities = DataFrame(facilities)
    hotel = Hotel.objects.get(hotel_id=hotel_1_id)
    hotels = [hotel]
    try:  # pragma: no cover
        # Join the hotel's own (available) facilities with the catalogue
        # to pick up human-readable descriptions.
        hotel_facilities = DataFrame(hotel.facilities)
        hotel_facilities.query('available == True', inplace=True)
        hotel_facilities = merge(hotel_facilities, facilities,
                                 on=['code', 'group'])
        hotel_facility_lists = [hotel_facilities.to_dict('records')]
    except Exception:
        # Best-effort: missing/malformed facility data yields an empty list.
        hotel_facility_lists = []
    rooms = [
        HotelbedsRoom.objects.get(code=stay['room_type_1']).description
    ]
    boards = [HotelbedsBoard.objects.get(code=stay['board_1']).description]
    if hotel_2_id > 0:  # pragma: no cover
        # Second hotel of a switch stay: repeat the lookups above.
        hotel = Hotel.objects.get(hotel_id=hotel_2_id)
        hotels.append(hotel)
        try:  # pragma: no cover
            hotel_facilities = DataFrame(hotel.facilities)
            hotel_facilities.query('available == True', inplace=True)
            hotel_facilities = merge(hotel_facilities, facilities,
                                     on=['code', 'group'])
            hotel_facility_lists.append(
                hotel_facilities.to_dict('records'))
        except Exception:
            pass
        rooms.append(
            HotelbedsRoom.objects.get(
                code=stay['room_type_2']).description)
        boards.append(
            HotelbedsBoard.objects.get(code=stay['board_2']).description)
    # Stored separately for passing to JS
    galleria_images = self.parse_hotel_images(hotels)
    try:  # pragma: no cover
        tripadvisor_reviews = [
            tripadvisor.get_tripadvisor_review(
                hotel.tripadvisor.tripadvisor) for hotel in hotels
        ]
    except AttributeError:
        # Hotel without a TripAdvisor link — skip reviews entirely.
        tripadvisor_reviews = []
    # Re-key facility lists by hotel id; hotels without a fetched list
    # get an empty one.
    facilities = {}
    for idx, hotel in enumerate(hotels):
        lst = []
        try:
            lst = hotel_facility_lists[idx]
        except IndexError:
            pass
        facilities[hotel.hotel_id] = lst
    context.update({
        'stay': stay,
        'hotels': hotels,
        'galleria_images': galleria_images,
        'rooms': rooms,
        'boards': boards,
        'facilities': facilities,
        'check_out_1': check_out_1,
        'check_in_2': check_in_2,
        'tripadvisor_reviews': tripadvisor_reviews,
        'blocked_countries': settings.BLOCKED_COUNTRIES
    })
    return context
def _extract_process_step_statistics(self, process_steps: pd.DataFrame,
                                     machine_utilization: pd.DataFrame,
                                     data_transfers: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-process-step statistics and write them to CSV.

    For each step in ``process_steps`` this combines machine utilization
    samples that fall inside the step window with the per-processor
    idle/busy statistics collected by ``self._processor_stats_in``.

    Writes ``process-step-statistics.csv`` and
    ``processor-step-statistics.csv`` as a side effect and returns the
    per-step statistics frame.

    Note: ``data_transfers`` is currently unused (see the commented-out
    query below).
    """
    # Fixed output column order — identical to the empty-frame template
    # the previous implementation appended into.
    step_columns = [
        PROCESS_STEP_NAME, PROCESS_STEP_START, PROCESS_STEP_END,
        PROCESS_STEP_ABS_DUR, PROCESS_STEP_REL_DUR,
        STEP_STATS_SUM_ABS_DURATION, STEP_STATS_SUM_REL_DURATION,
        STEP_STATS_AVG_CPU, STEP_STATS_AVG_ABS_MEM, STEP_STATS_AVG_REL_MEM,
        STEP_STATS_MAX_ABS_MEM, STEP_STATS_MAX_REL_MEM,
        STEP_STATS_TOT_ABS_IDLE_TIME, STEP_STATS_TOT_REL_IDLE_TIME,
        STEP_STATS_AVG_ABS_IDLE_TIME, STEP_STATS_AVG_REL_IDLE_TIME,
        STEP_STATS_MAX_ABS_IDLE_TIME, STEP_STATS_MAX_REL_IDLE_TIME,
        STEP_STATS_TOT_ABS_BUSY_TIME, STEP_STATS_TOT_REL_BUSY_TIME,
        STEP_STATS_AVG_ABS_BUSY_TIME, STEP_STATS_AVG_REL_BUSY_TIME,
        STEP_STATS_MAX_ABS_BUSY_TIME, STEP_STATS_MAX_REL_BUSY_TIME,
    ]
    # Accumulator frame passed through self._processor_stats_in for each step.
    processor_step_stats: pd.DataFrame = pd.DataFrame(
        {PROCESS_STEP_NAME: [], PROCESS_STEP_START: [],
         PROCESS_STEP_END: [], PROCESS_STEP_ABS_DUR: [],
         PROCESS_STEP_REL_DUR: [],
         PROCESSOR_STEP_STATS_PROC_NAME: [],
         PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME: [],
         PROCESSOR_STEP_STATS_TOT_REL_IDLE_TIME: [],
         PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME: [],
         PROCESSOR_STEP_STATS_TOT_REL_BUSY_TIME: [],
         PROCESSOR_STEP_STATS_MAX_ABS_MEM: [],
         PROCESSOR_STEP_STATS_MAX_REL_MEM: []})
    num_processors: int = len(self._events)
    # FIX: the old code did `data = data.append({...}, ignore_index=True)`
    # inside the loop — DataFrame.append is deprecated (removed in pandas
    # 2.0) and quadratic. Collect plain dicts and build the frame once.
    rows: list = []
    for i in range(process_steps.shape[0]):
        step_name: str = process_steps.at[i, PROCESS_STEP_NAME]
        start: float = process_steps.at[i, PROCESS_STEP_START]
        end: float = process_steps.at[i, PROCESS_STEP_END]
        duration: float = process_steps.at[i, PROCESS_STEP_ABS_DUR]
        # Total processor-time budget of the step across all processors.
        summed_duration: float = duration * num_processors
        # Utilization samples inside [start, end].
        step_machine_utilization: pd.DataFrame = machine_utilization.query(
            f"{MACHINE_UTILIZATION_TIME} >= {start} & "
            f"{MACHINE_UTILIZATION_TIME} <= {end}")
        # step_data_transfer: pd.DataFrame = data_transfers.query(
        #     f"{DATA_TRANSFER_START} <= {end} & {DATA_TRANSFER_END} >= {start}")
        machine_avg: pd.Series = step_machine_utilization.mean()
        machine_max: pd.Series = step_machine_utilization.max()
        processor_step_stats = self._processor_stats_in(
            process_steps.iloc[i], processor_step_stats)
        p_stats: pd.DataFrame = processor_step_stats.query(
            f"{PROCESS_STEP_NAME} == '{step_name}'")
        p_stats_sum: pd.Series = p_stats.sum()
        p_stats_max: pd.Series = p_stats.max()
        tot_idle = p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME]
        tot_busy = p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME]
        max_idle = p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME]
        max_busy = p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME]
        rows.append({
            PROCESS_STEP_NAME: step_name,
            PROCESS_STEP_START: start,
            PROCESS_STEP_END: end,
            PROCESS_STEP_ABS_DUR: duration,
            PROCESS_STEP_REL_DUR: process_steps.at[i, PROCESS_STEP_REL_DUR],
            STEP_STATS_SUM_ABS_DURATION: summed_duration,
            STEP_STATS_SUM_REL_DURATION:
                summed_duration / (self._process_duration * num_processors),
            STEP_STATS_AVG_CPU: machine_avg[MACHINE_UTILIZATION_AVG_CPU],
            STEP_STATS_AVG_ABS_MEM: machine_avg[MACHINE_UTILIZATION_AVG_ABS_MEM],
            STEP_STATS_AVG_REL_MEM: machine_avg[MACHINE_UTILIZATION_AVG_REL_MEM],
            STEP_STATS_MAX_ABS_MEM: machine_max[MACHINE_UTILIZATION_MAX_ABS_MEM],
            STEP_STATS_MAX_REL_MEM: machine_max[MACHINE_UTILIZATION_MAX_REL_MEM],
            STEP_STATS_TOT_ABS_IDLE_TIME: tot_idle,
            STEP_STATS_TOT_REL_IDLE_TIME: tot_idle / summed_duration,
            STEP_STATS_AVG_ABS_IDLE_TIME: tot_idle / num_processors,
            STEP_STATS_AVG_REL_IDLE_TIME:
                (tot_idle / num_processors) / duration,
            STEP_STATS_MAX_ABS_IDLE_TIME: max_idle,
            STEP_STATS_MAX_REL_IDLE_TIME: max_idle / duration,
            STEP_STATS_TOT_ABS_BUSY_TIME: tot_busy,
            STEP_STATS_TOT_REL_BUSY_TIME: tot_busy / summed_duration,
            STEP_STATS_AVG_ABS_BUSY_TIME: tot_busy / num_processors,
            STEP_STATS_AVG_REL_BUSY_TIME:
                (tot_busy / num_processors) / duration,
            STEP_STATS_MAX_ABS_BUSY_TIME: max_busy,
            STEP_STATS_MAX_REL_BUSY_TIME: max_busy / duration,
        })
    data: pd.DataFrame = pd.DataFrame(rows, columns=step_columns)
    data.to_csv("process-step-statistics.csv", sep=";", index_label="index")
    processor_step_stats.to_csv("processor-step-statistics.csv", sep=";",
                                index_label="index")
    return data
def report(
    df: pd.DataFrame,
    id_col: str = "Compound_Id",
    columns: List[str] = None,
    title: str = "Cluster Report",
    intro: str = "Large clusters first, similar clusters together.",
):
    """Write a HTML report.
    `Cluster_No` and `IsRepr` have to be present in the DataFrame.
    In the current setting, the largest clusters are at the top of the report,
    with similar clusters (determind by the chemical similarities of the
    representative structures) are grouped together.
    Writes the report to disk as `Clusters.html`.
    Used in `projects/paint3_anal/190328_cpd_clustering.ipynb`.
    Arguments:
        df: The input DataFrame containing the structures as Smiles.
        id_col: The name of the column to use for identity.
            Default is `Compound_Id`.
        columns: List of columns to include.
            Defaults to `["Compound_Id", "Smiles"]`.
        title: The report title.
        intro: Some text used for introduction of the report.
    """
    # BUG FIX: `columns` previously defaulted to a mutable list which the
    # body then mutated (`columns.append("Smiles")`), corrupting the shared
    # default across calls. Use None as the default and copy caller input.
    if columns is None:
        columns = ["Compound_Id", "Smiles"]
    else:
        columns = list(columns)  # never mutate the caller's list

    def add_cluster(cl_no, sim_to=None):
        # Render one cluster: heading + structure grid. `sim_to` marks a
        # cluster that was pulled in because it is similar to `sim_to`.
        if sim_to is None:
            sim_to = ""
            html.append("<hr>")
        else:
            sim_to = f"(similar to {sim_to})"
        mf_cl = mf.MolFrame(df.query("Cluster_No == @cl_no")[columns])
        mf_cl = mf_cl.add_mols()
        html.append(
            f"<br><h2>Cluster {cl_no} ({len(mf_cl.data)} Members) {sim_to}</h2><br>"
        )
        grid = mfv.html_grid(mf_cl.data, id_col="Compound_Id")
        html.append(grid)

    if id_col not in columns:
        columns = [id_col] + columns
    if "Smiles" not in columns:
        columns.append("Smiles")
    # Representative structure per cluster, used for inter-cluster similarity.
    df_repr = df.query("IsRepr == 'Yes'").reset_index().drop("index", axis=1)
    chem_sim = {}
    for idx, rec0 in df_repr.iterrows():
        for _, rec1 in df_repr.iloc[idx + 1:].iterrows():
            cl0 = rec0["Cluster_No"]
            cl1 = rec1["Cluster_No"]
            sim = mf.chem_sim(rec0["Smiles"], rec1["Smiles"])
            # Store symmetrically so lookup order never matters.
            chem_sim[(cl0, cl1)] = sim
            chem_sim[(cl1, cl0)] = sim
    cl_sizes = (df[["Cluster_No", "Compound_Id"
                    ]].groupby(by="Cluster_No").count().reset_index().rename(
                        columns={"Compound_Id": "Size"}))
    cl_sizes = cl_sizes.sort_values("Size", ascending=False)
    # Ordered "set" of clusters still to be rendered, largest first.
    cl_order = {x: True for x in cl_sizes["Cluster_No"].values}
    html = [f"<h1>{title}</h1><br>{intro}<br><br>"]
    while len(cl_order) > 0:
        cl_no = list(cl_order.keys())[0]
        add_cluster(cl_no)
        cl_order.pop(cl_no)
        # Pull in all remaining clusters similar to the one just rendered.
        to_remove = []
        for sim_cl in cl_order:
            if chem_sim[(cl_no, sim_cl)] > 0.45:
                add_cluster(sim_cl, cl_no)
                to_remove.append(sim_cl)
        for x in to_remove:
            cl_order.pop(x)
    mfht.write(mfht.page("\n".join(html)), "Clusters.html")
def generate_causal_graph(place_change_events: DataFrame,
                          transition_events: DataFrame,
                          time_per_step: float):
    """Build a directed graph of Occasions from simulation event logs.

    Nodes are occasions (unit, state, time); edges lead from each
    occasion's prehensions (upstream inputs) into it.

    Args:
        place_change_events: rows with at least tstep, time, name, num.
        transition_events: rows with tstep, time, name, unit, neighbour,
            neighbour2 (NaN when unset) and count.
        time_per_step: simulated time per step, used as a search slack
            when no earlier state-change event can be found.
    """
    g = nx.DiGraph(
    )  # Nodes are occasions and edges leading in their prehensions
    # Add the initial state for each node as an occasion with no past
    initial_occasions = place_change_events.query('tstep == 0')
    for occ in initial_occasions.itertuples():
        g.add_node(Occasion(int(occ.num), occ.name,
                            occ.time))  # unit, state, time
    # Visit each transition and identify i) its output node and its 2 input nodes
    for trans in transition_events.itertuples():
        # row has: tstep, time, name, unit, neighbour & count
        # TODO: IS IT SAFE TO IGNORE THIS?
        # assert trans.count == 1
        # Statistically likely to happen as simulations get more complex or
        # are undersampled. Consider what to do if this occurs --Rob
        # Create new occasion in graph for this transition
        # output_state = trans.name[1]  # ab -> b
        prefix, input_state, output_state = expand_transition_name(
            trans.name)  # strings
        if math.isnan(trans.unit):
            # Malformed event row: no unit recorded — log and skip.
            print(f"*** {trans.unit} {output_state} {trans.time}")
            continue
        output_occasion = Occasion(int(trans.unit), output_state, trans.time)
        g.add_node(output_occasion)

        def choose_best_upstream_occasion(target_unit, target_state_name,
                                          source_time):
            # Find the time of the most recent change of `target_unit` into
            # `target_state_name` strictly before `source_time`; fall back to
            # the same instant, then to one step after (simultaneous events
            # may be logged slightly late).
            query = f"num=={target_unit} & name=='{target_state_name}' & time<{source_time}"
            last_transition_time = place_change_events.query(
                query)['time'].max()
            if math.isnan(last_transition_time):
                # Try including the source time
                query = f"num=={target_unit} & name=='{target_state_name}' & time=={source_time}"
                last_transition_time = place_change_events.query(
                    query)['time'].min()
            if math.isnan(last_transition_time):
                # Try including the step after
                query = f"num=={target_unit} & name=='{target_state_name}' & time<={source_time + time_per_step}"
                last_transition_time = place_change_events.query(
                    query)['time'].min()
            return Occasion(target_unit, target_state_name,
                            last_transition_time)

        # Determine local input node from same unit
        # state_name = trans.name[0]  # ab -> a
        local_input_occasion = choose_best_upstream_occasion(
            trans.unit, input_state, trans.time)
        g.add_edge(local_input_occasion, output_occasion)
        # Determine input node from neighbour
        # state_name = trans.name[1]  # ab -> b
        neighbour_input_occasion = choose_best_upstream_occasion(
            trans.neighbour, output_state, trans.time)
        g.add_edge(neighbour_input_occasion, output_occasion)
        # Determine input node from neighbour2 if set
        if not math.isnan(trans.neighbour2):
            # state_name = trans.name[1]  # ab -> b
            # neighbour2 assumed pulling state forward (like neighbour)
            neighbour2_input_occasion = choose_best_upstream_occasion(
                trans.neighbour2, output_state, trans.time)
            g.add_edge(neighbour2_input_occasion, output_occasion)
    return g
def get_vinbigdata_dicts(
    imgdir: Path,
    train_df: pd.DataFrame,
    train_data_type: str = "original",
    use_cache: bool = True,
    debug: bool = True,
    target_indices: Optional[np.ndarray] = None,
):
    """Build (or load from cache) detectron2-style dataset dicts.

    Args:
        imgdir: directory containing ``train_meta.csv`` and ``train/*.png``.
        train_df: per-annotation frame with image_id, class_id and bbox
            columns (x_min, y_min, x_max, y_max) in original coordinates.
        train_data_type: tag used in the cache file name.
        use_cache: when True and the cache pickle exists, load it instead
            of rebuilding.
        debug: restrict to the first 500 images.
        target_indices: optional subset of records to return.

    Returns:
        List of record dicts with file_name/image_id/height/width/annotations.
    """
    debug_str = f"_debug{int(debug)}"
    train_data_type_str = f"_{train_data_type}"
    cache_path = Path(
        ".") / f"dataset_dicts_cache{train_data_type_str}{debug_str}.pkl"
    if not use_cache or not cache_path.exists():
        print("Creating data...")
        train_meta = pd.read_csv(imgdir / "train_meta.csv")
        if debug:
            train_meta = train_meta.iloc[:500]  # For debug....
        # Load 1 image to get image size — all resized images share it.
        image_id = train_meta.loc[0, "image_id"]
        image_path = str(imgdir / "train" / f"{image_id}.png")
        image = cv2.imread(image_path)
        resized_height, resized_width, ch = image.shape
        print(f"image shape: {image.shape}")
        dataset_dicts = []
        for index, train_meta_row in tqdm(train_meta.iterrows(),
                                          total=len(train_meta)):
            record = {}
            image_id, height, width = train_meta_row.values
            filename = str(imgdir / "train" / f"{image_id}.png")
            record["file_name"] = filename
            record["image_id"] = image_id
            record["height"] = resized_height
            record["width"] = resized_width
            objs = []
            for index2, row in train_df.query(
                    "image_id == @image_id").iterrows():
                class_id = row["class_id"]
                if class_id == 14:
                    # "No finding": this annotator found nothing, skip.
                    continue
                # Scale original-coordinate boxes into the resized frame.
                h_ratio = resized_height / height
                w_ratio = resized_width / width
                bbox_resized = [
                    int(row["x_min"]) * w_ratio,
                    int(row["y_min"]) * h_ratio,
                    int(row["x_max"]) * w_ratio,
                    int(row["y_max"]) * h_ratio,
                ]
                objs.append({
                    "bbox": bbox_resized,
                    "bbox_mode": BoxMode.XYXY_ABS,
                    "category_id": class_id,
                })
            record["annotations"] = objs
            dataset_dicts.append(record)
        with open(cache_path, mode="wb") as f:
            pickle.dump(dataset_dicts, f)
    else:
        # BUG FIX: the old flow unconditionally printed "Load from cache"
        # and re-read the pickle even when the data had just been created.
        print(f"Load from cache {cache_path}")
        with open(cache_path, mode="rb") as f:
            dataset_dicts = pickle.load(f)
    if target_indices is not None:
        dataset_dicts = [dataset_dicts[i] for i in target_indices]
    return dataset_dicts
def parse_raw_zooniverse_file(
        raw_zooniverse_classifications: pd.DataFrame) -> pd.DataFrame:
    """Flatten a raw Zooniverse classification export into one row per
    transcription, keeping only the "Transcribe Words" workflow (v3.7).

    Returns a frame with id, subject metadata (barcode/block/paragraph/
    word/gcv_identification/image_location), handwriting flag,
    transcription text, unclear flag, seen_count, confidence and status.
    """
    relevant = raw_zooniverse_classifications.query(
        'workflow_name == "Transcribe Words" and workflow_version == 3.7'
    ).copy()

    def _literalize(raw: str):
        # JSON dumps use 'null'; swap for 'None' so ast can evaluate them.
        cooked = ast.literal_eval(raw.replace('null', 'None'))
        if isinstance(cooked, dict):  # subject_data arrives wrapped in a dict
            cooked = list(cooked.values())[0]
        return cooked

    relevant.loc[:, 'annotations'] = relevant['annotations'].apply(_literalize)
    relevant.loc[:, 'subject_data'] = relevant['subject_data'].apply(
        _literalize)

    parsed = pd.DataFrame()
    parsed['id'] = relevant['subject_data'].apply(
        lambda subject: subject['image_of_boxed_letter'].replace(
            'wordbox-', '').replace('.jpg', '').replace('label-', ''))

    def _subject_to_row(subject):
        # The barcode field may carry a "-label" suffix from the file name.
        return pd.Series(
            [
                subject['barcode'].split('-')[0],
                int(subject['block_no']),
                int(subject['paragraph_no']),
                int(subject['word_no']),
                subject['#GCV_identification'],
                subject['image_of_boxed_letter'],
            ],
            index=[
                'barcode', 'block', 'paragraph', 'word',
                'gcv_identification', 'image_location'
            ])

    parsed = pd.concat(
        [parsed, relevant['subject_data'].apply(_subject_to_row)], axis=1)
    parsed['handwritten'] = relevant['annotations'].apply(
        lambda ann: ann[0]['value'] == 'handwritten')
    parsed['human_transcription'] = relevant['annotations'].apply(
        lambda ann: ann[1]['value'])
    parsed['unclear'] = parsed['human_transcription'].apply(
        lambda text: '[unclear]' in text and '[/unclear]' in text)
    parsed['human_transcription'] = parsed['human_transcription'].apply(
        lambda text: text.replace('[unclear][/unclear]', ''))
    # How many volunteers saw each subject.
    parsed['seen_count'] = parsed.groupby('id')['block'].transform(len)
    parsed['confidence'] = 1.0
    parsed['status'] = 'In Progress'
    return parsed
def test_query_syntax_error(self):
    """A dangling binary operator in a query must raise SyntaxError."""
    engine, parser = self.engine, self.parser
    frame = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
    with pytest.raises(SyntaxError, match="invalid syntax"):
        frame.query("i - +", engine=engine, parser=parser)
from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the grade table: one row per student, columns from grade_dic.
STUDENT_NAMES = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(grade_dic, index=STUDENT_NAMES)

# Multi-condition row selection (OR): Korean below 50 or English below 40.
all_index = df.query('국어 < 50 or 영어 < 40')

# Show the matching rows.
print_df(all_index)
def preprocess_data(X: pd.DataFrame,
                    add_province_columns: bool = False,
                    drop_first_day: bool = False) -> pd.DataFrame:
    """
    Preprocess data to be used in StemPoissonRegressor by adding columns for
    previous day information and also adding columns for all regions as
    predictors.

    Args:
        X (pd.DataFrame): Dataframe with columns province, date,
            active_cases, percent_susceptible.
        add_province_columns (bool, optional): If variables active_cases and
            percent_susceptible should be added as columns for each province.
            Defaults to False.
        drop_first_day (bool, optional): Whether to drop the first day of
            each province (which has no previous-day data). Defaults to False.

    Returns:
        pd.DataFrame: Preprocessed dataframe.
    """
    # Log-transformed features (log(x + 1) for counts to avoid log(0)).
    X = X.assign(
        log_active_cases=lambda d: np.log(d["active_cases"] + 1),
        log_percent_susceptible=lambda d: np.log(d["percent_susceptible"]),
    )

    # Previous-day (t-1) values, shifted within each province.
    lag_cols = [
        "active_cases",
        "percent_susceptible",
        "log_active_cases",
        "log_percent_susceptible",
    ]
    lagged = X.groupby("province").shift(periods=1).loc[:, lag_cols]
    X = X.assign(**{f"{col}_yesterday": lagged[col] for col in lag_cols})

    X_new = X.copy()
    if add_province_columns:
        provinces = X_new["province"].unique()
        wide_cols = [
            "active_cases_yesterday",
            "percent_susceptible_yesterday",
            "log_active_cases_yesterday",
            "log_percent_susceptible_yesterday",
            "active_cases",
            "percent_susceptible",
            "log_active_cases",
            "log_percent_susceptible",
        ]
        for province in provinces:
            # Replicate this province's rows so the block lines up with the
            # full frame when concatenated column-wise.
            block = X.query("province == @province").loc[:, wide_cols]
            block = pd.concat([block] * len(provinces), ignore_index=True)
            block = block.rename(
                columns={col: f"{province}_{col}" for col in wide_cols})
            X_new = pd.concat([X_new, block], axis=1)

    if drop_first_day:
        # NaN != NaN, so this keeps only rows with valid t-1 data.
        X_new = X_new.query("active_cases_yesterday == active_cases_yesterday")
        X_new.reset_index(drop=True, inplace=True)
    return X_new
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a raw county ICU-capacity table into the long
    (location_name, variable, value, ...) layout selected by
    ``self.out_cols``.

    Builds combined adult+pediatric ICU totals, melts per-county rows,
    sums duplicates, and attaches CMU variable metadata plus dt/vintage.
    """
    # Characters/strings scrubbed from the raw string columns upstream.
    bads = [r",", r"%", "nan"]
    str_cols = ["County", "FileNumber", "ProviderName"]
    df = self._clean_cols(df, bads, str_cols)
    df["location_name"] = df["County"].str.title()
    # Create new columns: combined adult + pediatric totals.
    df["ICU Census"] = df["Adult ICU Census"] + df["Pediatric ICU Census"]
    df["ICU Capacity"] = (df["Total AdultICU Capacity"] +
                          df["Total PediatricICU Capacity"])
    df["Available ICU"] = df["Available Adult ICU"] + df[
        "Available Pediatric ICU"]
    # Rename appropriate columns: map each source column to its CMU
    # (category / measurement / unit) descriptor.
    crename = {
        "Adult ICU Census":
        CMU(category="adult_icu_beds_in_use",
            measurement="current",
            unit="beds"),
        "Available Adult ICU":
        CMU(category="adult_icu_beds_available",
            measurement="current",
            unit="beds"),
        "Total AdultICU Capacity":
        CMU(category="adult_icu_beds_capacity",
            measurement="current",
            unit="beds"),
        "Pediatric ICU Census":
        CMU(category="pediatric_icu_beds_in_use",
            measurement="current",
            unit="beds"),
        "Available Pediatric ICU":
        CMU(
            category="pediatric_icu_beds_available",
            measurement="current",
            unit="beds",
        ),
        "Total PediatricICU Capacity":
        CMU(
            category="pediatric_icu_beds_capacity",
            measurement="current",
            unit="beds",
        ),
        "ICU Census":
        CMU(category="icu_beds_in_use", measurement="current", unit="beds"),
        "ICU Capacity":
        CMU(category="icu_beds_capacity", measurement="current", unit="beds"),
        "Available ICU":
        CMU(category="icu_beds_available",
            measurement="current",
            unit="beds"),
    }
    # Drop grand total and melt to long format; drop missing values.
    out = (df.query("location_name != 'Grand Total'").melt(
        id_vars=["location_name"], value_vars=crename.keys()).dropna())
    out["value"] = pd.to_numeric(out["value"])
    # Multiple facilities per county — sum them to county level.
    out = out.groupby(["location_name", "variable"]).sum().reset_index()
    # Source spells this county "Desoto"; the canonical name is "DeSoto".
    out.loc[out["location_name"] == "Desoto", "location_name"] = "DeSoto"
    # Extract category information and add other context
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Eastern")
    out["vintage"] = self._retrieve_vintage()
    # NOTE(review): return value ignored — assumes clean_desoto mutates
    # `out` in place; verify.
    self.clean_desoto(out)
    return out.loc[:, self.out_cols]
def py_query(
    data: pd.DataFrame,
    query,
    *,
    use_pd_query=False,
    allow_empty_result=False,
    setup_code='',
    globals=None,
    return_selected_data=True,
):
    """
    Alternative: pd.DataFrame.query: supports a subset of this function,
    but is faster

    >>> df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
    >>> df
       a  b
    0  1  2
    1  3  4
    >>> py_query(df, 'a == 1')
       a  b
    0  1  2
    >>> py_query(df, 'a == 1', use_pd_query=True)
       a  b
    0  1  2
    >>> py_query(df, 'int(a) == 1')
       a  b
    0  1  2
    >>> py_query(df, ['int(a) == 1', 'b == 2'])
       a  b
    0  1  2
    >>> py_query(df, ['index == 1'])  # get second row
       a  b
    1  3  4
    >>> py_query(df, ['index == 1'], use_pd_query=True)
       a  b
    1  3  4

    To access column names that aren't valid python identifiers
    (e.g. the name contains a whitespace), you have to use the kwargs
    dictionary:

    >>> df = pd.DataFrame([{'a b': 1, 'b': 2}, {'a b': 3, 'b': 4}])
    >>> py_query(df, 'kwargs["a b"] == 1')
       a b  b
    0    1  2

    When you need a package function, you have to specify it in the globals
    dict. e.g.:

    >>> import numpy as np
    >>> df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
    >>> py_query(df, 'np.equal(a, 1)', globals={'np': np})
       a  b
    0  1  2

    Args:
        data: pandas.DataFrame
        query: str or list of str. If list of str the strings get join by a
            logical `and` to be a str. For examples see doctest.
            Note: Use index to get access to the index.
        use_pd_query: Pandas query is much faster but limited
        allow_empty_result:
        setup_code: legacy argument, Superseded by the globals argument.
            Additional code which runs before the query conditions.
            You may use this for additional imports.
        globals: Specify some global names. Useful for imports.
            See doctest how to use it.
        return_selected_data: Whether to return the selection of the data or
            the selection indices.

    Returns:
        data[selection] if not return_selection else selection
    """
    # Fast paths: a falsy/empty query selects everything unchanged.
    if query is False:
        return data
    if query in [[], tuple(), '']:
        return data
    if isinstance(query, (list, tuple)):
        if len(query) == 1:
            query, = query
        else:
            # Join sub-conditions with a logical `and`, each parenthesised.
            query = ') and ('.join(query)
            query = f'({query})'
    else:
        assert isinstance(query, str)
    if use_pd_query is True:
        return data.query(query)
    elif use_pd_query == 'try':
        # Best effort: fall back to the slow Python path when pandas'
        # limited query parser cannot handle the expression.
        try:
            return data.query(query)
        except Exception:
            pass
    else:
        assert use_pd_query is False, use_pd_query
    # Candidate parameter names for the generated function: the index
    # plus every column whose name is a valid Python identifier.
    keywords = ['index'] + list(data)

    def is_valid_variable_name(name):
        import ast
        # https://stackoverflow.com/a/36331242/5766934
        try:
            ast.parse('{} = None'.format(name))
            return True
        except (SyntaxError, ValueError, TypeError):
            return False

    # Non-identifier column names are still reachable via **kwargs.
    keywords = [k for k in keywords
                if is_valid_variable_name(k)] + ['**kwargs']
    d = {}
    # Generate a row predicate; the broad except re-raises with locals()
    # attached so failing rows are debuggable.
    code = f"""
def func({', '.join(keywords)}):
    {setup_code}
    try:
        return {query}
    except Exception:
        raise Exception('See above error message. Locals are:', locals())
"""
    if globals is None:
        globals = {}
    else:
        # Copy so exec's writes don't leak into the caller's dict.
        globals = globals.copy()
    try:
        exec(code, globals, d)
        func = d['func']
    except Exception as e:
        # Surface the generated source when it fails to compile.
        raise Exception(code) from e
    # Evaluate the predicate once per row (row.name is the index value).
    selection = data.apply(lambda row: func(row.name, **row), axis=1)
    assert allow_empty_result or len(selection) > 0, len(selection)
    if return_selected_data:
        return data[selection]
    else:
        return selection
def test_query_with_nested_special_character(self, parser, engine):
    """'&' inside a quoted string literal must not be parsed as an operator."""
    skip_if_no_pandas_parser(parser)
    frame = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
    expected = frame[frame.a == "test & test"]
    result = frame.query('a == "test & test"', parser=parser, engine=engine)
    tm.assert_frame_equal(result, expected)
def fit(
    self,
    X: pd.DataFrame,
    Y: pd.DataFrame,
):
    """
    Fit, per province, a Poisson regression model for new cases using
    active_cases and percent_susceptible at time t-1, and another model
    for removed using active_cases at time t-1.

    Args:
        X (pd.DataFrame): Predictor variables containing columns date,
            province, active_cases, percent_susceptible, plus the
            per-province {province_name}_active_cases_yesterday /
            {province_name}_percent_susceptible_yesterday columns and all
            log features.
        Y (pd.DataFrame): Response variables containing columns date,
            province, cases, removed.
    """
    self.X_original = X.copy()
    self.Y_original = Y.copy()
    self.provinces = X["province"].unique()
    # Fit one cases model and one removed model per province.
    self.X_cases = {}
    self.Y_cases = {}
    self.X_removed = {}
    self.Y_removed = {}
    self.poisson_gam_cases = {}
    self.poisson_gam_removed = {}
    for province in self.provinces:
        # Remove extra columns for given province in form {province}_column_name
        cols_drop = X.filter(regex=province, axis=1).columns
        X_province = X.query(f"province == '{province}'").drop(cols_drop,
                                                               axis=1)
        Y_province = Y.query(f"province == '{province}'")
        # Store case dataframe used to train model for each province
        self.X_cases[province] = X_province.filter(
            regex=
            r"(log_active_cases_yesterday|log_percent_susceptible_yesterday)"
        )
        self.Y_cases[province] = Y_province["cases"]
        # Add terms for each province I_t-1 and Z_t-1 — splines or linear.
        if self.use_splines:
            terms = s(0, lam=self.lam_main) + s(1, lam=self.lam_main)
            for i in range(1, len(self.provinces)):
                terms += s(i * 2, lam=self.lam_other) + s(
                    i * 2 + 1, lam=self.lam_other)
        else:
            # NOTE(review): the spline branch uses lam_main for BOTH leading
            # terms but this branch uses lam_other for the second — confirm
            # whether that asymmetry is intended.
            terms = l(0, lam=self.lam_main) + l(1, lam=self.lam_other)
            for i in range(1, len(self.provinces)):
                terms += l(i * 2, lam=self.lam_other) + l(
                    i * 2 + 1, lam=self.lam_other)
        # Fit cases model for province
        cases_model = PoissonGAM(terms, verbose=self.verbose)
        cases_model.fit(self.X_cases[province], self.Y_cases[province])
        self.poisson_gam_cases[province] = cases_model
        # Store removed dataframe used to train model for each province
        self.X_removed[province] = X_province.filter(
            regex=r"log_active_cases_yesterday")
        self.Y_removed[province] = Y_province["removed"]
        # Add terms for each province I_t-1
        terms = l(0, lam=self.lam_main)
        for i in range(1, len(self.provinces)):
            terms += l(i, lam=self.lam_other)
        # Fit removed model for each province.
        removed_model = PoissonGAM(terms, verbose=self.verbose)
        # BUG FIX: previously fitted against self.Y_cases[province]; the
        # removed model must target the "removed" response.
        removed_model.fit(self.X_removed[province], self.Y_removed[province])
        self.poisson_gam_removed[province] = removed_model
    return
def fit(
    self,
    X: pd.DataFrame,
    Y: pd.DataFrame,
):
    """
    Fit a poisson regression model each for the cases using active_cases
    and percentage_susceptible at time t-1, and another model for removed
    using active_cases at time t-1. Optionally augments both models with
    (time-shifted) twitter features when ``self.twitter_data`` is set.

    Args:
        X (pd.DataFrame): Predictors for one region with columns date,
            active_cases, percent_susceptible (plus log/lag features).
        Y (pd.DataFrame): Responses with columns date, cases, removed.
    """
    # Remove days in data that are after the latest twitter data given.
    if self.twitter_data is not None:
        remove_date = self.twitter_data["date"].max()
    else:
        remove_date = X["date"].max()
    X = X.query("date <= @remove_date")
    Y = Y.query("date <= @remove_date")
    self.X_original = X.copy()
    self.Y_original = Y.copy()
    # Separate data for each model.
    self.X_cases = X[[
        "date", "log_active_cases_yesterday",
        "log_percent_susceptible_yesterday"
    ]].copy()
    self.Y_cases = Y["cases"]
    self.X_removed = X[["date", "log_active_cases_yesterday"]].copy()
    self.Y_removed = Y["removed"]
    # Shift twitter data by twitter_offset days so each row carries the
    # twitter signal from twitter_offset days earlier.
    if self.twitter_data is not None:
        twitter_shifted = self.twitter_data.drop(
            ["date", "province"],
            axis=1).shift(periods=self.twitter_offset, axis=0)
        twitter_shifted.columns = [
            f"{col}_shifted" for col in twitter_shifted.columns
        ]
        twitter_shifted = twitter_shifted.assign(
            date=self.twitter_data["date"])
        # Add twitter data to use in both cases and removed models.
        self.X_cases = self.X_cases.merge(twitter_shifted,
                                          how="left",
                                          on=["date"])
        self.X_removed = self.X_removed.merge(twitter_shifted,
                                              how="left",
                                              on=["date"])
    # Drop date columns not used anymore.
    self.X_cases = self.X_cases.drop("date", axis=1)
    self.X_removed = self.X_removed.drop("date", axis=1)
    # Setup terms for covid19 data to use in GLM.
    term = s if self.use_spline else l
    terms_cases = term(0, lam=self.lam) + term(1, lam=self.lam)
    terms_removed = term(0, lam=self.lam)
    # BUG FIX: the twitter-term block below previously ran unguarded and
    # crashed with AttributeError (None.columns) when twitter_data is None
    # — a configuration this method explicitly supports above.
    if self.twitter_data is not None:
        twitter_cols = self.twitter_data.columns.drop(["date", "province"])
        for i in range(0, len(twitter_cols)):
            terms_cases = terms_cases + term(i + 2, lam=self.lam)
            terms_removed = terms_removed + term(i + 1, lam=self.lam)
    # Model new cases using infections and percent susceptible at t-1.
    self.poisson_gam_cases = PoissonGAM(terms_cases, verbose=self.verbose)
    self.poisson_gam_cases.fit(self.X_cases, self.Y_cases)
    # Model removed cases using infections at t-1.
    self.poisson_gam_removed = PoissonGAM(terms_removed,
                                          verbose=self.verbose)
    self.poisson_gam_removed.fit(self.X_removed, self.Y_removed)
    return
def data(ws, mongodb, slug):
    """Stream a dashboard element's data over a WebSocket.

    Looks up the element by *slug*, pulls its cube from the DataWarehouse,
    applies query-string filters / groupby / ordering, then sends a series
    of JSON messages over *ws*: 'last_update', 'columns', 'max_page', one
    'data' message per row, 'categories', and finally 'close'.
    """
    if not ws:
        abort(400, 'Expected WebSocket request.')
    DW = DataWarehouse()
    element = mongodb['element'].find_one({'slug': slug})
    # Default page size; an explicit "no limit" request bumps it to a
    # sentinel huge value.
    element['page_limit'] = 50
    # NOTE(review): request.GET values are strings, so this `is False`
    # comparison looks like it can never be true — confirm intended check.
    if request.GET.get('limit', True) is False:
        element['page_limit'] = 9999999999
    data = DW.get(element.get('cube'))
    columns = data.get('columns') or []
    fields = columns
    # Optional comma-separated subset of columns requested by the client.
    if request.GET.get('fields', None):
        fields = request.GET.get('fields').split(',')
    # Tell the client when the cube was last refreshed, and which columns
    # will follow.
    cube_last_update = mongodb['cube'].find_one({'slug': element.get('cube')})
    ws.send(json.dumps({'type': 'last_update',
                        'data': str(cube_last_update.get('lastupdate', ''))}))
    ws.send(json.dumps({'type': 'columns', 'data': fields}))
    # Collect query params of the form 'filter__<field>__<operator>'.
    # (iteritems: this code targets Python 2.)
    filters = [i[0] for i in request.GET.iteritems()
               if len(i[0].split('filter__')) > 1]
    if element['type'] == 'grid':
        # Grids are paginated; compute the slice bounds for this page.
        page = int(request.GET.get('page', 1))
        page_start = 0
        page_end = element['page_limit']
        if page >= 2:
            page_end = element['page_limit'] * page
            page_start = page_end - element['page_limit']
    else:
        # Non-grid elements (charts, etc.) get the full dataset.
        page_start = None
        page_end = None
    df = DataFrame(data.get('data') or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            # f is 'filter__<field>__<operator>'; its value is the operand.
            s = f.split('__')
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == 'like':
                df = df[df[field].str.contains(value)]
            elif operator == 'regex':
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                # Other operators are compiled into a DataFrame.query
                # expression by the df_generate helper.
                df = df.query(df_generate(df, value, f))
    groupby = []
    if request.GET.get('groupby', None):
        groupby = request.GET.get('groupby', ).split(',')
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())
    # Ordering: query string wins over the element's stored default, and
    # is only applied when the orderby target is a visible field.
    if request.GET.get('orderby',
                       element.get('orderby', None)) and request.GET.get(
            'orderby', element.get('orderby', None)) in fields:
        orderby = request.GET.get('orderby', element.get('orderby', ''))
        if type(orderby) == str:
            orderby = orderby.split(',')
        orderby__order = request.GET.get('orderby__order',
                                         element.get('orderby__order', ''))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(',')
        # Convert '0'/'1' flags into the booleans pandas expects for
        # ascending=.
        ind = 0
        for orde in orderby__order:
            if orde == '0':
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        # NOTE(review): DataFrame.sort is the legacy (pre-0.20) pandas API.
        df = df.sort(orderby, ascending=orderby__order)
    ws.send(json.dumps({'type': 'max_page', 'data': len(df)}))
    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []
    # Stream one message per record in the requested page slice; collect
    # the category column values along the way (used by chart elements).
    # NOTE(review): to_dict(outtype=...) is legacy pandas (now orient=).
    for i in df.to_dict(outtype='records')[page_start:page_end]:
        if element.get('categories', None):
            categories.append(i[element.get('categories')])
        ws.send(json.dumps({'type': 'data', 'data': i}))
    # CLEAN MEMORY
    del df
    gc.collect()
    ws.send(json.dumps({'type': 'categories', 'data': categories}))
    ws.send(json.dumps({'type': 'close'}))
    # CLEAN MEMORY
    del categories
    gc.collect()
def _get_covered(self, pred: pd.DataFrame): catalog = self.catalog return set(pred.query('to_id in @catalog')['to_id'])