def test_var_std(self): string_series = tm.makeStringSeries().rename('series') datetime_series = tm.makeTimeSeries().rename('ts') alt = lambda x: np.std(x, ddof=1) self._check_stat_op('std', alt, string_series) alt = lambda x: np.var(x, ddof=1) self._check_stat_op('var', alt, string_series) result = datetime_series.std(ddof=4) expected = np.std(datetime_series.values, ddof=4) tm.assert_almost_equal(result, expected) result = datetime_series.var(ddof=4) expected = np.var(datetime_series.values, ddof=4) tm.assert_almost_equal(result, expected) # 1 - element series with ddof=1 s = datetime_series.iloc[[0]] result = s.var(ddof=1) assert pd.isna(result) result = s.std(ddof=1) assert pd.isna(result)
def enrich_company(company_dedup_list, company_scrapy_result, company_colnames):
    company_scrapy_verify = pd.DataFrame(columns=company_colnames)
    for index, company in company_dedup_list.iterrows():
        if company['db_New'] == False:
            continue
        sourceid = company['Source_ID']
        scrapy_list = company_scrapy_result[company_scrapy_result['Source_ID'] == sourceid]
        scrapy_best = scrapy_list[scrapy_list['Confidence'] == 0]
        # If there are multiple best matches, take the first one that has an address
        if len(scrapy_best) > 1:
            if len(scrapy_best[scrapy_best['地址'].notnull()]) > 1:
                scrapy_best = scrapy_best[scrapy_best['地址'].notnull()].iloc[0].to_frame().transpose()
            else:
                scrapy_best = scrapy_best.iloc[0].to_frame().transpose()
            company = enrich_scrapy(company, scrapy_best)
        # If there is no best match, collect companies without an address for manual verification
        elif len(scrapy_best) < 1:
            if pd.isna(company['Billing_Address']) and pd.isna(company['Billing_Address_CN']):
                company_scrapy_verify = company_scrapy_verify.append(company.to_frame().transpose())
        # Exactly one best match: enrich directly
        else:
            company = enrich_scrapy(company, scrapy_best)
        company_dedup_list[company_dedup_list['Source_ID'] == company['Source_ID']] = company.to_frame().transpose()
    company_dedup_list = validate_company(company_dedup_list)
    company_scrapy_verify = validate_company(company_scrapy_verify)
    return company_dedup_list, company_scrapy_verify
def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) assert isinstance(index, Float64Index) expected = np.array([1, 2, 3, 4, 5], dtype='float64') tm.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) assert isinstance(index, Float64Index) index = Float64Index([1., 2, 3, 4, 5]) assert isinstance(index, Float64Index) index = Float64Index(np.array([1., 2, 3, 4, 5])) assert isinstance(index, Float64Index) assert index.dtype == float index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 # nan handling result = Float64Index([np.nan, np.nan]) assert pd.isna(result.values).all() result = Float64Index(np.array([np.nan])) assert pd.isna(result.values).all() result = Index(np.array([np.nan])) assert pd.isna(result.values).all()
def test_isna_behavior(idx):
    # should not segfault GH5123
    # NOTE: if MI representation changes, may make sense to allow
    # isna(MI)
    msg = "isna is not defined for MultiIndex"
    with pytest.raises(NotImplementedError, match=msg):
        pd.isna(idx)
def test_constructor_inferred_fill_value(self, data, fill_value):
    result = SparseArray(data).fill_value
    if pd.isna(fill_value):
        assert pd.isna(result)
    else:
        assert result == fill_value
def test_idxmin(self): # test idxmin # _check_stat_op approach can not be used here because of isna check. string_series = tm.makeStringSeries().rename('series') # add some NaNs string_series[5:15] = np.NaN # skipna or no assert string_series[string_series.idxmin()] == string_series.min() assert pd.isna(string_series.idxmin(skipna=False)) # no NaNs nona = string_series.dropna() assert nona[nona.idxmin()] == nona.min() assert (nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin()) # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmin()) # datetime64[ns] s = Series(pd.date_range('20130102', periods=6)) result = s.idxmin() assert result == 0 s[0] = np.nan result = s.idxmin() assert result == 1
def test_minmax(self): for tz in self.tz: # monotonic idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz=tz) assert idx1.is_monotonic # non-monotonic idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', '2011-01-02', pd.NaT], tz=tz) assert not idx2.is_monotonic for idx in [idx1, idx2]: assert idx.min() == Timestamp('2011-01-01', tz=tz) assert idx.max() == Timestamp('2011-01-03', tz=tz) assert idx.argmin() == 0 assert idx.argmax() == 2 for op in ['min', 'max']: # Return NaT obj = DatetimeIndex([]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def test_inferred_dtype(dtype, fill_value):
    sparse_dtype = SparseDtype(dtype)
    result = sparse_dtype.fill_value
    if pd.isna(fill_value):
        assert pd.isna(result) and type(result) == type(fill_value)
    else:
        assert result == fill_value
def _check_fill(meth, op, a, b, fill_value=0): exp_index = a.index.union(b.index) a = a.reindex(exp_index) b = b.reindex(exp_index) amask = isna(a) bmask = isna(b) exp_values = [] for i in range(len(exp_index)): with np.errstate(all='ignore'): if amask[i]: if bmask[i]: exp_values.append(np.nan) continue exp_values.append(op(fill_value, b[i])) elif bmask[i]: if amask[i]: exp_values.append(np.nan) continue exp_values.append(op(a[i], fill_value)) else: exp_values.append(op(a[i], b[i])) result = meth(a, b, fill_value=fill_value) expected = Series(exp_values, exp_index) assert_series_equal(result, expected)
def test_minmax(self): # monotonic idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) assert idx1.is_monotonic # non-monotonic idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) assert not idx2.is_monotonic for idx in [idx1, idx2]: assert idx.min() == Timedelta('1 days') assert idx.max() == Timedelta('3 days') assert idx.argmin() == 0 assert idx.argmax() == 2 for op in ['min', 'max']: # Return NaT obj = TimedeltaIndex([]) assert pd.isna(getattr(obj, op)()) obj = TimedeltaIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def test_replace2(self): N = 100 ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan ser[6:10] = 'foo' ser[20:30] = 'bar' # replace list with a single value rs = ser.replace([np.nan, 'foo', 'bar'], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() assert (rs[20:30] == -1).all() assert (pd.isna(ser[:5])).all() # replace with different values rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() assert (rs[20:30] == -3).all() assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all()
def diff_small():
    df_ret = pd.read_excel('ret.xlsx')
    df_src = pd.read_excel('src.xlsx')
    # sort_values returns a new frame; assign the result so the rows are actually sorted
    df_ret = df_ret.sort_values(by='id_card_number')
    df_ret_cols = list(df_ret.columns.values)
    id_nums = df_ret['id_card_number']
    df_src_valid = df_src[df_src['id_card_number'].isin(id_nums)]
    df_src_valid = df_src_valid.sort_values(by='id_card_number')
    exp = 0.00001
    discard_cols = [
        'id_card_number',
        'score',
        'gender',
        'id_number_province',
    ]
    float_cols = [
        'pdl_credit_24'
    ]
    cols_dict = {
        'tongdun': ['tongdun_status', 'tongdun_25', 'tongdun_4', 'tongdun_14',
                    'tongdun_41', 'tongdun_259', 'tongdun_120', 'tongdun_152',
                    'tongdun_87', 'tongdun_136'],
        'call_record': ['call_record_600', 'call_record_441'],
        'hj': ['hj_3y_xfnl_5', 'hj_3y_xfnl_score'],
        'contact': ['contact_10', 'contact_11'],
        'ei': ['education'],
        'pdl': ['pdl_credit_24'],
    }
    col_data_dict = {col: [] for cols in cols_dict.values() for col in cols}
    for col in df_ret_cols:
        if col in discard_cols:
            continue
        ret_vals = list(df_ret[col])
        src_vals = list(df_src_valid[col])
        for i, id_num in enumerate(id_nums):
            is_ret_vals_na = isna(ret_vals[i])
            is_src_vals_na = isna(src_vals[i])
            if is_ret_vals_na and is_src_vals_na:
                continue
            if is_ret_vals_na or is_src_vals_na:
                col_data_dict.get(col).append(id_num)
            else:
                if col in float_cols:
                    if abs(ret_vals[i] - src_vals[i]) > exp:
                        col_data_dict.get(col).append(id_num)
                else:
                    if ret_vals[i] != src_vals[i]:
                        col_data_dict.get(col).append(id_num)
    for key, cols in cols_dict.items():
        diff_set = set()
        for col in cols:
            diff_set |= set(col_data_dict[col])
        print('ids of %s is %s' % (key, sorted(list(diff_set))))
        print('count of %s is %s' % (key, len(diff_set)))
def test_NaT_scalar(self):
    series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]')
    val = series[3]
    assert pd.isna(val)
    series[2] = val
    assert pd.isna(series[2])
def test_NaT_scalar(self):
    series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')
    val = series[3]
    assert isna(val)
    series[2] = val
    assert isna(series[2])
def test_where_unsafe(): # see gh-9731 s = Series(np.arange(10), dtype="int64") values = [2.5, 3.5, 4.5, 5.5] mask = s > 5 expected = Series(lrange(6) + values, dtype="float64") s[mask] = values assert_series_equal(s, expected) # see gh-3235 s = Series(np.arange(10), dtype='int64') mask = s < 5 s[mask] = lrange(2, 7) expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') assert_series_equal(s, expected) assert s.dtype == expected.dtype s = Series(np.arange(10), dtype='int64') mask = s > 5 s[mask] = [0] * 4 expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') assert_series_equal(s, expected) s = Series(np.arange(10)) mask = s > 5 def f(): s[mask] = [5, 4, 3, 2, 1] pytest.raises(ValueError, f) def f(): s[mask] = [0] * 5 pytest.raises(ValueError, f) # dtype changes s = Series([1, 2, 3, 4]) result = s.where(s > 2, np.nan) expected = Series([np.nan, np.nan, 3, 4]) assert_series_equal(result, expected) # GH 4667 # setting with None changes dtype s = Series(range(10)).astype(float) s[8] = None result = s[8] assert isna(result) s = Series(range(10)).astype(float) s[s > 8] = None result = s[isna(s)] expected = Series(np.nan, index=[9]) assert_series_equal(result, expected)
def test_minmax_nat(self, op): # Return NaT obj = DatetimeIndex([]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT]) assert pd.isna(getattr(obj, op)()) obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) assert pd.isna(getattr(obj, op)())
def scalar_add(a, b):
    # TODO: should really be a type-specific NA
    if pd.isna(a) or pd.isna(b):
        return np.nan
    if is_integer(a):
        a = int(a)
    elif is_integer(b):
        b = int(b)
    return a + b
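# Minimal, hypothetical usage sketch for scalar_add above. Assumptions: pandas and
# numpy are imported as pd/np, and `is_integer` (used by scalar_add) comes from
# pandas.api.types.
import numpy as np
import pandas as pd
from pandas.api.types import is_integer

assert scalar_add(2, 3) == 5            # plain addition when both operands are present
assert np.isnan(scalar_add(np.nan, 3))  # any missing operand collapses to np.nan
assert np.isnan(scalar_add(1, None))    # None is also treated as missing by pd.isna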
def test_operators_na_handling(self): from decimal import Decimal from datetime import date s = Series([Decimal('1.3'), Decimal('2.3')], index=[date(2012, 1, 1), date(2012, 1, 2)]) result = s + s.shift(1) result2 = s.shift(1) + s assert isna(result[0]) assert isna(result2[0])
def NanCleanerApply(x):
    # @param x is a column of the dataset
    maskNan = pd.isna(x)
    maskNotNan = pd.notna(x)
    notNan = x[maskNotNan]
    nan = x[maskNan]
    avg = int(np.average(notNan))
    for i in range(0, len(x)):
        if pd.isna(x[i]):
            x[i] = avg
    return x
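# A vectorized alternative sketch for the cleaner above, using pandas built-ins.
# The helper name is ours (not from the original); it assumes x is a numeric pd.Series.
# fillna replaces every missing value with the truncated mean in a single call.
import numpy as np
import pandas as pd

def nan_cleaner_vectorized(x: pd.Series) -> pd.Series:
    avg = int(np.average(x.dropna()))
    return x.fillna(avg)

# Example: pd.Series([1.0, np.nan, 4.0]) -> [1.0, 2.0, 4.0] (mean of 1 and 4 is 2.5, truncated to 2)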
def test_map_int(self): left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) right = Series({1: 11, 2: 22, 3: 33}) assert left.dtype == np.float_ assert issubclass(right.dtype.type, np.integer) merged = left.map(right) assert merged.dtype == np.float_ assert isna(merged['d']) assert not isna(merged['c'])
def test_reindex_boolean(self): frame = DataFrame(np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2]) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[0][1]) reindexed = frame.reindex(columns=lrange(3)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[1]).all()
def test_series_setitem( self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data s = ymd['A'] s[2000, 3] = np.nan assert isna(s.values[42:65]).all() assert notna(s.values[:42]).all() assert notna(s.values[65:]).all() s[2000, 3, 10] = np.nan assert isna(s[49])
def getConfidence(company_scrapy): company_search_key = company_scrapy['搜索词'] company_response_name = company_scrapy['公司名称'] if pd.isna(company_search_key) or pd.isna(company_response_name): return None elif hasCHN(company_search_key) and hasCHN(company_response_name): company_search_key = vd.extract_keyword(company_search_key) company_response_name = vd.extract_keyword(company_response_name) return lv.distance(company_search_key,company_response_name) elif not hasCHN(company_search_key) and not hasCHN(company_response_name): return lvEN(company_search_key, company_response_name) else: return None
def test_max_min(self, start, stop, step): # GH17607 idx = RangeIndex(start, stop, step) expected = idx._int64index.max() result = idx.max() assert result == expected expected = idx._int64index.min() result = idx.min() assert result == expected # empty idx = RangeIndex(start, stop, -step) assert isna(idx.max()) assert isna(idx.min())
def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) arr[::3] = np.nan result = cut(arr, 4) result_arr = np.asarray(result) ex_arr = np.where(isna(arr), np.nan, result_arr) tm.assert_almost_equal(result_arr, ex_arr) result = cut(arr, 4, labels=False) ex_result = np.where(isna(arr), np.nan, result) tm.assert_almost_equal(result, ex_result)
def test_isna(self, data_missing): expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) expected = SparseArray([True, False], dtype=expected_dtype) result = pd.isna(data_missing) self.assert_equal(result, expected) result = pd.Series(data_missing).isna() expected = pd.Series(expected) self.assert_series_equal(result, expected) # GH 21189 result = pd.Series(data_missing).drop([0, 1]).isna() expected = pd.Series([], dtype=expected_dtype) self.assert_series_equal(result, expected)
def value_counts(self, dropna=True): """ Returns a Series containing counts of unique values. Parameters ---------- dropna : boolean, default True Don't include counts of NaN, even if NaN is in sp_values. Returns ------- counts : Series """ keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: pass else: if self._null_fill_value: mask = pd.isna(keys) else: mask = keys == self.fill_value if mask.any(): counts[mask] += fcounts else: keys = np.insert(keys, 0, self.fill_value) counts = np.insert(counts, 0, fcounts) if not isinstance(keys, pd.Index): keys = pd.Index(keys) result = pd.Series(counts, index=keys) return result
def _try_convert_to_date(self, data):
    """
    try to parse an ndarray-like into a date column

    try to coerce object in epoch/iso formats and
    integer/float in epoch formats, return a boolean if parsing
    was successful
    """
    # no conversion on empty
    if not len(data):
        return data, False

    new_data = data
    if new_data.dtype == 'object':
        try:
            new_data = data.astype('int64')
        except (TypeError, ValueError, OverflowError):
            pass

    # ignore numbers that are out of range
    if issubclass(new_data.dtype.type, np.number):
        in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
                    (new_data.values == iNaT))
        if not in_range.all():
            return data, False

    date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
    for date_unit in date_units:
        try:
            new_data = to_datetime(new_data, errors='raise', unit=date_unit)
        except ValueError:
            continue
        except Exception:
            break
        return new_data, True
    return data, False
def test_cut_out_of_bounds():
    arr = np.random.randn(100)
    result = cut(arr, [-1, 0, 1])
    mask = isna(result)
    ex_mask = (arr < -1) | (arr > 1)
    tm.assert_numpy_array_equal(mask, ex_mask)
def test_identity(klass): assert klass(None) is NaT result = klass(np.nan) assert result is NaT result = klass(None) assert result is NaT result = klass(iNaT) assert result is NaT result = klass(np.nan) assert result is NaT result = klass(float('nan')) assert result is NaT result = klass(NaT) assert result is NaT result = klass('NaT') assert result is NaT assert isna(klass('nat'))
""" import pandas as pd import os import matplotlib.pyplot as plt import scipy.stats as spst results = pd.read_csv(os.path.join('datasets', 'bitzounis_results.csv'), decimal='.') stages = ['circuit', 'carmona', 'sprint', 'laguna'] for s in stages: # max fast_max = results[s + '_' + 'fast_max'] fast_max = fast_max[~pd.isna(fast_max)] slow_max = results[s + '_' + 'slow_max'] slow_max = slow_max[~pd.isna(slow_max)] w_max = spst.wilcoxon(fast_max, slow_max)[1] plt.clf() plt.boxplot([slow_max, fast_max]) plt.title(str(w_max)) plt.savefig(os.path.join('figs_bitz', s + '_' + 'max.png'), dpi=300) # avg fast_avg = results[s + '_' + 'fast_avg'] fast_avg = fast_avg[~pd.isna(fast_avg)]
from SVM import dataLoading, data_preprocessing, missing_data_processing
from math import exp
import math
import tensorflow as tf
import pandas as pd
import numpy as np
import os

# Read in the files
from sklearn.model_selection import train_test_split

train, test = dataLoading()
x = data_preprocessing(train, test)

print(test[pd.isna(test["Fare"])])  # test[]

feature_names = [
    "Pclass", "Sex", "Fare", "Embarked_C", "Embarked_Q", "Embarked_S"
]
X_train = train[feature_names]
Y_train = train["Survived"]
X_test = test[feature_names]

# ########################################################################################################################
# # Build the learning model
# # DT = basic example
def test_value_counts_datetime64(self): klasses = [Index, Series] for klass in klasses: # GH 3002, datetime64[ns] # don't test names though txt = "\n".join([ 'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM' ]) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) s.name = None idx = pd.to_datetime([ '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X' ]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat([ '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z' ], dtype='datetime64[ns]') if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: tm.assert_numpy_array_equal(s.unique(), expected) assert s.nunique() == 3 # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() assert result.index.dtype == 'datetime64[ns]' tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() assert unique.dtype == 'datetime64[ns]' # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) tm.assert_index_equal(unique, exp_idx) else: tm.assert_numpy_array_equal(unique[:3], expected) assert pd.isna(unique[3]) assert s.nunique() == 3 assert s.nunique(dropna=False) == 4 # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name='dt') result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')], name='dt') tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days'], name='dt') if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
for row in dt[col]:
    vocab.append(str(row))
    # Create the list of tokens:
    token = sorted(set(vocab))
    # Define a dictionary with a numeric value for each token:
    dictionary = {elem: ind for ind, elem in enumerate(token)}
    # Apply the dictionary to the original column:
    return dictionary


# ### Handling NaN
# Create columns recording where values were missing.

# In[579]:

dt['cuis_nan'] = pd.isna(dt['cuis_style']).astype('uint8')
dt['price_nan'] = pd.isna(dt['price']).astype('uint8')


# ### Price:
# Determine all possible values in the column and assign them numeric identifiers.

# In[580]:

dt['price'].value_counts()

# In[581]:

# Create numeric features for the Price column:
token = tokenizer('price')
# Apply the resulting dictionary to the column
def test_dti_tz_nat(self, tzstr):
    idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT])
    assert isna(idx[1])
    assert idx[0].tzinfo is not None
def _highlight_null(v, null_color):
    return ('background-color: {color}'.format(
        color=null_color) if pd.isna(v) else '')
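# Illustrative (hypothetical) usage of the styling helper above with the pandas Styler API:
# Styler.applymap calls the function on every cell and expects a CSS string back.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})
styled = df.style.applymap(lambda v: _highlight_null(v, "red"))
html = styled.to_html()  # NaN cells carry 'background-color: red' (to_html needs pandas >= 1.3)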
def dumper(self, collection): if collection == 'DXYArea': structured_results = list() results = self.db.dump(collection=collection) for province_dict in results: if province_dict.get('cities', None): for city_counter in range(len(province_dict['cities'])): city_dict = province_dict['cities'][city_counter] result = dict() result['provinceName'] = province_dict['provinceName'] result['cityName'] = city_dict['cityName'] result['province_confirmedCount'] = province_dict['confirmedCount'] result['province_suspectedCount'] = province_dict['suspectedCount'] result['province_curedCount'] = province_dict['curedCount'] result['province_deadCount'] = province_dict['deadCount'] result['city_confirmedCount'] = city_dict['confirmedCount'] result['city_suspectedCount'] = city_dict['suspectedCount'] result['city_curedCount'] = city_dict['curedCount'] result['city_deadCount'] = city_dict['deadCount'] result['updateTime'] = datetime.datetime.fromtimestamp(province_dict['updateTime']/1000) structured_results.append(result) df = pd.DataFrame(structured_results) df.to_csv( path_or_buf=os.path.join( os.path.split(os.path.realpath(__file__))[0], collection + '.csv'), index=False, encoding='utf_8_sig' ) else: df = pd.DataFrame(data=self.db.dump(collection=collection)) for time_type in time_types: if time_type in df.columns: df[time_type] = df[time_type].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000) if not pd.isna(x) else '') df.to_csv( path_or_buf=os.path.join( os.path.split(os.path.realpath(__file__))[0], collection + '.csv'), index=False, encoding='utf_8_sig' )
def _ohlc(group):
    if isna(group).all():
        return np.repeat(nan, 4)
    return [group[0], group.max(), group.min(), group[-1]]
tqdm.write("Loading Data...") df_tt_raw = pd.read_csv(os.path.join(in_root, "train_test.csv")) df_tt_raw.sort_values(by=['id', 'date'], inplace=True) df_tt_raw.set_index(['id'], inplace=True) df_pred = pd.read_csv(os.path.join(in_root, "pred.csv")) df_pred.sort_values(by=['id', 'date'], inplace=True) df_pred.set_index(['id'], inplace=True) # parse command line arguments, loop over targets for target_name in argv[2:]: # display target variables tqdm.write("Target: {}...".format(target_name)) # remove rows missing target df_tt = df_tt_raw[~pd.isna(df_tt_raw["target_" + target_name])] # xgboost hyperparams with open( os.path.join(out_root, "params/", "params_" + target_name + ".txt"), 'r') as f: params = literal_eval(f.read()) # collect unique station_ids station_ids = df_tt.index.unique() # only process a sub sample station_ids = pd.Series(station_ids).sample(frac=float(argv[1]), replace=False).tolist() # initialize k-fold
def prepare_file_form_scraped_data(self): all_files = [] path = r'{0}{1}_{2}*{3}'.format(self.scrape_path, self.scrape_file, self.version, self.csv_ext) all_files += glob.glob(path, recursive=True) li = [] for filename in all_files: df = pd.read_csv(filename, delimiter=self.csv_delimiter, encoding=self.csv_encoding, dtype={'ID': object}) li.append(df) try: frame = pd.concat(li, axis=0, ignore_index=True) except Exception as e: frame = pd.DataFrame() print('No sach files to concatenate {0}'.format(path)) print(str(e)) else: self.save_results(frame, '_final') scrape = frame self.df_scrape = scrape try: df = pd.read_csv(r'{0}{1}'.format(self.start_path, self.obec_words_file), delimiter=self.csv_delimiter, encoding=self.csv_encoding) except Exception as e: df = pd.DataFrame() print( 'Something went wrong on reading ' \ '{0}{1}'.format(self.start_path, self.obec_words_file) ) print(str(e)) else: self.df_words = df try: dfc = pd.read_csv(r'{0}{1}'.format( self.start_path, self.obec_known_characteristics), delimiter=self.csv_delimiter, encoding=self.csv_encoding, dtype={'ID': object}) except Exception as e: dfc = pd.DataFrame() print( 'Something went wrong on reading ' \ '{0}{1}'.format(self.start_path, self.obec_known_characteristics) ) print(str(e)) else: self.df_obec = dfc if not 'Link position' in frame.columns: frame = frame[['ID', 'Suggested URL', 'URL to scrape'] + df['Word'].unique().tolist()] frame = frame.groupby([ 'ID', 'URL' ])[df['Word'].unique().tolist()].sum().reset_index() else: frame = frame[[ 'ID', 'Name', 'Suggested URL', 'URL to scrape', 'Link position', 'Has equal domain', 'Has Simple Suggested URL' ] + df['Word'].unique().tolist()] frame = frame.groupby([ 'ID', 'Name', 'Suggested URL', 'Link position' ])[df['Word'].unique().tolist() \ + [ 'Has equal domain', 'Has Simple Suggested URL' ]].sum().reset_index() frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]] = frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]].where(~(frame[[ 'Has equal domain', 'Has Simple Suggested URL' ]] > 0), other=1) # count to 1 frame[df['Word'].unique().tolist()] = frame[ df['Word'].unique().tolist()].where( ~(frame[df['Word'].unique().tolist()] > 0), other=1) # word count to 1 try: frame = frame.join(dfc.set_index('ID'), on='ID') except Exception as e: frame = pd.DataFrame() print( 'Something went wrong on joining ' \ '{0}{1}_{2}{3} and {4}{5}'.format( self.machine_learning_path, self.scrape_file, self.version, self.csv_ext, self.start_path, self.obec_known_characteristics) ) print(str(e)) else: frame['Known OBEC'] = frame['OBEC'].apply(\ lambda x: 0 if pd.isna(x) else 1) if not 'Link position' in frame.columns: frame['Link position'] = frame.apply( lambda row: 1 if '.'.join( urlparse(row['URL'].lower()).netloc.split( '.')[-2:]) == '.'.join( urlparse(str(row['OBEC']).lower()). netloc.split('.')[-2:]) else 2, axis=1) frame['sum'] = 0 for x in df['Word'].unique().tolist(): frame['sum'] = frame['sum'] + frame[x] if 'Has equal domain' in frame.columns: frame['sum'] = frame['sum'] \ + frame['Has equal domain'] if 'Has Simple Suggested URL' in frame.columns: frame['sum'] = frame['sum'] \ + frame['Has Simple Suggested URL'] frame['Score'] = frame['sum'] \ - frame['sum'] \ * frame['Link position'] / 100 # frame['Score'] = frame['sum'] - frame['sum']/100 self.df_ml = frame self.save_results(frame, '_ml_ready') return [frame, df, dfc, scrape]
def test_reindex_bool_pad(datetime_series):
    # fail
    ts = datetime_series[5:]
    bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)
    filled_bool = bool_ts.reindex(datetime_series.index, method="pad")
    assert isna(filled_bool[:5]).all()
import os

import pandas as pd

SERVER_PORT = int(os.getenv("PORT") or 3000)
SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN")
SLACK_SIGNING_SECRET = os.getenv("SLACK_SIGNING_SECRET")
BOT_SERVICE_CHANNEL = os.getenv("BOT_SERVICE_CHANNEL")

slack_users = pd.read_csv(os.getenv("SLACK_USER_IDS"), index_col="tfs_name")
slack_users = {
    name: uid
    for name, uid in slack_users["id"].to_dict().items()
    if not pd.isna(name)
}

slack_ims = {}
def isna(self):
    nas = pd.isna(self._data.to_pandas())
    return type(self).from_scalars(nas)
def get_unique_values(series: pd.Series) -> List:
    """Returns a list of unique values in a series, including NaNs."""
    vals = list(sorted(series.dropna().unique()))
    if any(pd.isna(series)):
        vals.append(pd.NA)
    return vals
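# Quick illustrative check of get_unique_values above (the sample series is ours;
# assumes pandas >= 1.0 for pd.NA):
import numpy as np
import pandas as pd

s = pd.Series([3.0, 1.0, np.nan, 3.0])
print(get_unique_values(s))  # [1.0, 3.0, <NA>] -- sorted non-null uniques plus one NA marker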
def get_pyreadr_column_types(df): """ From a pandas data frame, get an OrderedDict with column name as key and pyreadr column type as value, and also a list with boolean values indicating if the column has missing values (np.nan). The pyreadr column types are needed for downstream processing. """ types = df.dtypes.values.tolist() columns = df.columns.values.tolist() result = OrderedDict() has_missing_values = [False] * len(columns) for indx, (col_name, col_type) in enumerate(zip(columns, types)): # recover original type for categories if type(col_type) is pd.core.dtypes.dtypes.CategoricalDtype: col_type = np.asarray(df[col_name]).dtype if col_type in int_types: result[col_name] = "INTEGER" elif col_type in float_types: result[col_name] = "NUMERIC" elif col_type == np.bool: result[col_name] = "LOGICAL" # np.datetime64[ns] elif col_type == np.dtype('<M8[ns]') or col_type == np.datetime64: result[col_name] = "DATETIME" missing = pd.isna(df[col_name]) if np.any(missing): has_missing_values[indx] = True elif col_type == np.object or col_type in int_mixed_types: missing = pd.isna(df[col_name]) if np.any(missing): has_missing_values[indx] = True if col_type in int_mixed_types: result[col_name] = "INTEGER" continue col = df[col_name].dropna() if len(col): curtype = type(col[0]) equal = col.apply(lambda x: type(x) == curtype) if not np.all(equal): result[col_name] = "OBJECT" continue else: result[col_name] = "LOGICAL" continue else: if col_type in int_mixed_types: result[col_name] = "INTEGER" continue curtype = type(df[col_name][0]) equal = df[col_name].apply(lambda x: type(x) == curtype) if not np.all(equal): result[col_name] = "OBJECT" continue if curtype in int_types: result[col_name] = "INTEGER" elif curtype in float_types: result[col_name] = "NUMERIC" elif curtype == np.bool: result[col_name] = "LOGICAL" elif curtype == str: result[col_name] = "CHARACTER" elif curtype == datetime.date: result[col_name] = "DATE" elif curtype == datetime.datetime: result[col_name] = "DATETIME" else: result[col_name] = "OBJECT" else: # generic object result[col_name] = "OBJECT" return result, has_missing_values
def opbyg_punktoversigt(
    navn: str,
    nyetablerede: pd.DataFrame,
    alle_punkter: Tuple[str, ...],
) -> pd.DataFrame:
    punktoversigt = pd.DataFrame(columns=list(ARKDEF_PUNKTOVERSIGT))
    fire.cli.print("Opbygger punktoversigt")

    # Extend punktoversigt so there is room for all points
    punktoversigt = punktoversigt.reindex(range(len(alle_punkter)))
    punktoversigt["Punkt"] = alle_punkter
    # Re-install the 'Punkt' column as the index column
    punktoversigt = punktoversigt.set_index("Punkt")

    nye_punkter = tuple(sorted(set(nyetablerede.index)))

    try:
        DVR90 = fire.cli.firedb.hent_srid("EPSG:5799")
    except KeyError:
        fire.cli.print("DVR90 (EPSG:5799) ikke fundet i srid-tabel",
                       bg="red", fg="white", err=True)
        sys.exit(1)

    for punkt in alle_punkter:
        if not pd.isna(punktoversigt.at[punkt, "Kote"]):
            continue
        if punkt in nye_punkter:
            continue

        fire.cli.print(f"Finder kote for {punkt}", fg="green")
        pkt = fire.cli.firedb.hent_punkt(punkt)

        # Dig out the current elevation (kote)
        kote = None
        for koord in pkt.koordinater:
            if koord.srid != DVR90:
                continue
            if koord.registreringtil is None:
                kote = koord
                break

        punktoversigt.at[punkt, "Fasthold"] = ""
        punktoversigt.at[punkt, "System"] = "DVR90"
        punktoversigt.at[punkt, "uuid"] = ""
        punktoversigt.at[punkt, "Udelad publikation"] = ""

        if kote is None:
            fire.cli.print(
                f"Ingen aktuel DVR90-kote fundet for {punkt}",
                bg="red",
                fg="white",
                err=True,
            )
            punktoversigt.at[punkt, "Kote"] = None
            punktoversigt.at[punkt, "σ"] = None
            punktoversigt.at[punkt, "Hvornår"] = None
        else:
            punktoversigt.at[punkt, "Kote"] = kote.z
            punktoversigt.at[punkt, "σ"] = kote.sz
            punktoversigt.at[punkt, "Hvornår"] = kote.t

        if pd.isna(punktoversigt.at[punkt, "Nord"]):
            punktoversigt.at[punkt, "Nord"] = pkt.geometri.koordinater[1]
            punktoversigt.at[punkt, "Øst"] = pkt.geometri.koordinater[0]

    # Newly established points are not in the database, so fetch any missing
    # elevations and location coordinates from the 'Nyetablerede punkter' sheet
    for punkt in nye_punkter:
        if pd.isna(punktoversigt.at[punkt, "Kote"]):
            punktoversigt.at[punkt, "Kote"] = None
        if pd.isna(punktoversigt.at[punkt, "Nord"]):
            punktoversigt.at[punkt, "Nord"] = nyetablerede.at[punkt, "Nord"]
        if pd.isna(punktoversigt.at[punkt, "Øst"]):
            punktoversigt.at[punkt, "Øst"] = nyetablerede.at[punkt, "Øst"]

    # Sanity-check the location coordinates
    for punkt in alle_punkter:
        λ, φ = normaliser_lokationskoordinat(punktoversigt.at[punkt, "Øst"],
                                             punktoversigt.at[punkt, "Nord"])
        punktoversigt.at[punkt, "Nord"] = φ
        punktoversigt.at[punkt, "Øst"] = λ

    # Reformat the dataframe so it is suitable for output
    return punktoversigt.reset_index()
columns=['kim','park','jung']) print('df:\n',df) print() filename = 'mynan.csv' table = pd.read_csv(filename,encoding='euc-kr', index_col=0) print('table:\n',table) print(type(table)) print(table.size) print(table.isna()) print() print(pd.isna(table)) print() print(table.notnull()) print() table2 = table.dropna() print('table2:\n', table2) print() table2 = table.dropna(how='any') print('table2(any):\n', table2) print() table2 = table.dropna(how='all') print('table2(all):\n', table2)
Goal: build a delivery business using robots that follow the routes found by our
algorithm, so the routes need to be optimized.
Files: Fourmiam.py
"""
import pandas
import networkx as nx
import matplotlib.pyplot as plt

map = pandas.read_csv('VOIES_NM.csv', nrows=5300, sep=",", encoding='latin-1')

for i in range(0, len(map)):
    # Check NaN
    if pandas.isna(map['TENANT'][i]) and not pandas.isna(map['ABOUTISSANT'][i]):
        map.loc[map.index[i], 'TENANT'] = i
    if not pandas.isna(map['TENANT'][i]) and pandas.isna(map['ABOUTISSANT'][i]):
        map.loc[map.index[i], 'ABOUTISSANT'] = i
    if not pandas.isna(map['BI_MIN'][i]):
        bi_min = map['BI_MIN'][i]
    else:
        bi_min = 1
    if not pandas.isna(map['BP_MIN'][i]):
        bp_min = map['BP_MIN'][i]
    else:
def preprocess(train): ## GameClock train['GameClock_sec'] = train['GameClock'].apply(strtoseconds) train["GameClock_minute"] = train["GameClock"].apply( lambda x: x.split(":")[0]).astype("object") ## Height train['PlayerHeight_dense'] = train['PlayerHeight'].apply( lambda x: 12 * int(x.split('-')[0]) + int(x.split('-')[1])) ## Time train['TimeHandoff'] = train['TimeHandoff'].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")) train['TimeSnap'] = train['TimeSnap'].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")) train['TimeDelta'] = train.apply( lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1) train['PlayerBirthDate'] = train['PlayerBirthDate'].apply( lambda x: datetime.datetime.strptime(x, "%m/%d/%Y")) ## Age seconds_in_year = 60 * 60 * 24 * 365.25 train['PlayerAge'] = train.apply(lambda row: (row['TimeHandoff'] - row[ 'PlayerBirthDate']).total_seconds() / seconds_in_year, axis=1) train["PlayerAge_ob"] = train['PlayerAge'].astype(np.int).astype("object") ## WindSpeed # train['WindSpeed_ob'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x) # train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x) # train['WindSpeed_ob'] = train['WindSpeed_ob'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x) # train['WindSpeed_dense'] = train['WindSpeed_ob'].apply(strtofloat) ## Weather train['GameWeather_process'] = train['GameWeather'].str.lower() train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy'). 
replace('party', 'partly') if not pd.isna(x) else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x) train['GameWeather_process'] = train['GameWeather_process'].apply( lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x) train['GameWeather_dense'] = train['GameWeather_process'].apply( map_weather) ## Rusher train['IsRusher'] = (train['NflId'] == train['NflIdRusher']) train['IsRusher_ob'] = ( train['NflId'] == train['NflIdRusher']).astype("object") temp = train[train["IsRusher"]][["Team", "PlayId" ]].rename(columns={"Team": "RusherTeam"}) train = train.merge(temp, on="PlayId") train["IsRusherTeam"] = train["Team"] == train["RusherTeam"] ## dense -> categorical train["Quarter_ob"] = train["Quarter"].astype("object") train["Down_ob"] = train["Down"].astype("object") train["JerseyNumber_ob"] = train["JerseyNumber"].astype("object") train["YardLine_ob"] = train["YardLine"].astype("object") # train["DefendersInTheBox_ob"] = train["DefendersInTheBox"].astype("object") # train["Week_ob"] = train["Week"].astype("object") # train["TimeDelta_ob"] = train["TimeDelta"].astype("object") ## Orientation and Dir train["Orientation_ob"] = train["Orientation"].apply( lambda x: orientation_to_cat(x)).astype("object") train["Dir_ob"] = train["Dir"].apply( lambda x: orientation_to_cat(x)).astype("object") train["Orientation_sin"] = train["Orientation"].apply( lambda x: np.sin(x / 360 * 2 * np.pi)) train["Orientation_cos"] = train["Orientation"].apply( lambda x: np.cos(x / 360 * 2 * np.pi)) train["Dir_sin"] = train["Dir"].apply( lambda x: np.sin(x / 360 * 2 * np.pi)) train["Dir_cos"] = train["Dir"].apply( lambda x: np.cos(x / 360 * 2 * np.pi)) ## diff Score train["diffScoreBeforePlay"] = train["HomeScoreBeforePlay"] - train[ "VisitorScoreBeforePlay"] train["diffScoreBeforePlay_binary_ob"] = ( train["HomeScoreBeforePlay"] > train["VisitorScoreBeforePlay"]).astype("object") ## Turf Turf = { 'Field Turf': 'Artificial', 'A-Turf Titan': 'Artificial', 'Grass': 'Natural', 'UBU Sports Speed S5-M': 'Artificial', 'Artificial': 'Artificial', 'DD GrassMaster': 'Artificial', 'Natural Grass': 'Natural', 'UBU Speed Series-S5-M': 'Artificial', 'FieldTurf': 'Artificial', 'FieldTurf 360': 'Artificial', 'Natural grass': 'Natural', 'grass': 'Natural', 'Natural': 'Natural', 'Artifical': 'Artificial', 'FieldTurf360': 'Artificial', 'Naturall Grass': 'Natural', 'Field turf': 'Artificial', 'SISGrass': 'Artificial', 'Twenty-Four/Seven Turf': 'Artificial', 'natural grass': 'Natural' } train['Turf'] = train['Turf'].map(Turf) ## OffensePersonnel temp = train["OffensePersonnel"].iloc[np.arange( 0, len(train), 22)].apply(lambda x: pd.Series(OffensePersonnelSplit(x))) temp.columns = ["Offense" + c for c in temp.columns] temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)] train = train.merge(temp, on="PlayId") ## DefensePersonnel temp = train["DefensePersonnel"].iloc[np.arange( 0, len(train), 22)].apply(lambda x: pd.Series(DefensePersonnelSplit(x))) temp.columns = ["Defense" + c for c in temp.columns] temp["PlayId"] = train["PlayId"].iloc[np.arange(0, len(train), 22)] train = train.merge(temp, on="PlayId") ## sort # train = train.sort_values(by = ['X']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'Team', 'IsRusher']).reset_index(drop = True) train = train.sort_values(by=['X']).sort_values(by=['Dis']).sort_values( by=['PlayId', 'IsRusherTeam', 'IsRusher']).reset_index(drop=True) return 
train
def stringify(s):
    return str(s) if not pd.isna(s) else None
if_gdp = _env.odir_root + '/summary_' + ds + '/country_specific_statistics_GDP_' + ds + '_' + p_scen + '_Burke.xls' if_ctrylist = _env.idir_root + '/regioncode/Country_List.xls' if_ctryshp = (_env.idir_root + '/shape/country/country1.shp') itbl_gdp = pd.read_excel(if_gdp, 'country-lag0') itbl_gdp.set_index('iso', inplace=True) ishp_ctry = gp.read_file(if_ctryshp) #correct country code ishp_ctry.loc[ishp_ctry['GMI_CNTRY'] == 'ROM', 'GMI_CNTRY'] = 'ROU' ishp_ctry.loc[ishp_ctry['GMI_CNTRY'] == 'ZAR', 'GMI_CNTRY'] = 'COD' ishp_ctry.set_index('GMI_CNTRY', inplace=True) ishp_ctry['GDP_median'] = itbl_gdp['GDP_median_benefit_ratio'] ishp_ctry.loc[pd.isna(ishp_ctry['GDP_median']), 'GDP_median'] = -999 _env.mkdirs(_env.odir_root + 'gdp_map_' + ds) ishp_ctry.to_file(_env.odir_root + 'gdp_map_' + ds + '/gdp_country_' + p_scen + '.shp') ishp_ctry.drop('geometry', axis=1).to_csv(_env.odir_root + 'gdp_map_' + ds + '/country_gdp_ratio_median_' + ds + '.csv') ax = fig.add_subplot(414) m = Basemap(ellps='WGS84', llcrnrlon=-180, llcrnrlat=-90, urcrnrlon=177.5, urcrnrlat=90., suppress_ticks=False)
def check_fun_data(self, testfunc, targfunc, testarval, targarval, targarnanval, check_dtype=True, empty_targfunc=None, **kwargs): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval if skipna and empty_targfunc and isna(targartempval).all(): targ = empty_targfunc(targartempval, axis=axis, **kwargs) else: targ = targfunc(targartempval, axis=axis, **kwargs) try: res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if axis is None: res = testfunc(testarval, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna and axis is None: res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1), 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) raise if testarval.ndim <= 1: return try: testarval2 = np.take(testarval, 0, axis=-1) targarval2 = np.take(targarval, 0, axis=-1) targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return self.check_fun_data(testfunc, targfunc, testarval2, targarval2, targarnanval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, **kwargs)
def _calculate_divisions( df: DataFrame, partition_col: Series, repartition: bool, npartitions: int, upsample: float = 1.0, partition_size: float = 128e6, ) -> Tuple[List, List, List]: """ Utility function to calculate divisions for calls to `map_partitions` """ sizes = df.map_partitions(sizeof) if repartition else [] divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample) mins = partition_col.map_partitions(M.min) maxes = partition_col.map_partitions(M.max) try: divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes) except TypeError as e: # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float. # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA. # If this happens, we hint the user about eliminating nulls beforehand. if not is_numeric_dtype(partition_col.dtype): obj, suggested_method = ( ("column", f"`.dropna(subset=['{partition_col.name}'])`") if any( partition_col._name == df[c]._name for c in df) else ("series", "`.loc[series[~series.isna()]]`")) raise NotImplementedError( f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n" f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n" f"We suggest you try with {suggested_method}.") from e # For numeric types there shouldn't be problems with nulls, so we raise as-it-is this particular TypeError else: raise e divisions = methods.tolist(divisions) if type(sizes) is not list: sizes = methods.tolist(sizes) mins = methods.tolist(mins) maxes = methods.tolist(maxes) empty_dataframe_detected = pd.isna(divisions).all() if repartition or empty_dataframe_detected: total = sum(sizes) npartitions = max(math.ceil(total / partition_size), 1) npartitions = min(npartitions, df.npartitions) n = len(divisions) try: divisions = np.interp( x=np.linspace(0, n - 1, npartitions + 1), xp=np.linspace(0, n - 1, n), fp=divisions, ).tolist() except (TypeError, ValueError): # str type indexes = np.linspace(0, n - 1, npartitions + 1).astype(int) divisions = [divisions[i] for i in indexes] else: # Drop duplicate divisions returned by partition quantiles divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]] mins = remove_nans(mins) maxes = remove_nans(maxes) if pd.api.types.is_categorical_dtype(partition_col.dtype): dtype = partition_col.dtype mins = pd.Categorical(mins, dtype=dtype).codes.tolist() maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist() return divisions, mins, maxes
def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) values = o._ndarray_values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = iNaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == o.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = 'a' else: if is_datetimetz(o): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' # check values has the same dtype as the original assert o.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: exp = Series(nanloc, o.index, name='a') tm.assert_series_equal(pd.isna(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype='int64', name='a') expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64', name='a') result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == 'a' result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == 'a' result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert o.nunique() == 8 assert o.nunique(dropna=False) == 9
def integrify(x):
    return int(float(x)) if not pd.isna(x) else None
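# Illustrative behaviour of the integrify coercion helper above
# (a small, hypothetical check; assumes numpy is imported as np and pandas as pd):
import numpy as np

print(integrify("3.7"))   # 3 -- parses the string, then truncates toward zero
print(integrify(np.nan))  # None -- missing values are passed through as None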
def set_partition( df: DataFrame, index: Union[str, Series], divisions: Sequence, max_branch: int = 32, drop: bool = True, shuffle: Optional[str] = None, compute: Optional[bool] = None, ) -> DataFrame: """Group DataFrame by index Sets a new index and partitions data along that index according to divisions. Divisions are often found by computing approximate quantiles. The function ``set_index`` will do both of these steps. Parameters ---------- df: DataFrame/Series Data that we want to re-partition index: string or Series Column to become the new index divisions: list Values to form new divisions between partitions drop: bool, default True Whether to delete columns to be used as the new index shuffle: str (optional) Either 'disk' for an on-disk shuffle or 'tasks' to use the task scheduling framework. Use 'disk' if you are on a single machine and 'tasks' if you are on a distributed cluster. max_branch: int (optional) If using the task-based shuffle, the amount of splitting each partition undergoes. Increase this for fewer copies but more scheduler overhead. See Also -------- set_index shuffle partd """ meta = df._meta._constructor_sliced([0]) if isinstance(divisions, tuple): # pd.isna considers tuples to be scalars. Convert to a list. divisions = list(divisions) if not isinstance(index, Series): dtype = df[index].dtype else: dtype = index.dtype if pd.isna(divisions).any() and pd.api.types.is_integer_dtype(dtype): # Can't construct a Series[int64] when any / all of the divisions are NaN. divisions = df._meta._constructor_sliced(divisions) elif (pd.api.types.is_categorical_dtype(dtype) and UNKNOWN_CATEGORIES in dtype.categories): # If categories are unknown, leave as a string dtype instead. divisions = df._meta._constructor_sliced(divisions) else: divisions = df._meta._constructor_sliced(divisions, dtype=dtype) if not isinstance(index, Series): partitions = df[index].map_partitions(set_partitions_pre, divisions=divisions, meta=meta) df2 = df.assign(_partitions=partitions) else: partitions = index.map_partitions(set_partitions_pre, divisions=divisions, meta=meta) df2 = df.assign(_partitions=partitions, _index=index) df3 = rearrange_by_column( df2, "_partitions", max_branch=max_branch, npartitions=len(divisions) - 1, shuffle=shuffle, compute=compute, ignore_index=True, ) if not isinstance(index, Series): df4 = df3.map_partitions( set_index_post_scalar, index_name=index, drop=drop, column_dtype=df.columns.dtype, ) else: df4 = df3.map_partitions( set_index_post_series, index_name=index.name, drop=drop, column_dtype=df.columns.dtype, ) df4.divisions = tuple(methods.tolist(divisions)) return df4.map_partitions(M.sort_index)
def sort_values( df: DataFrame, by: Union[str, List[str]], npartitions: Optional[Union[int, Literal["auto"]]] = None, ascending: Union[bool, List[bool]] = True, na_position: Union[Literal["first"], Literal["last"]] = "last", upsample: float = 1.0, partition_size: float = 128e6, sort_function: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None, sort_function_kwargs: Optional[Mapping[str, Any]] = None, **kwargs, ) -> DataFrame: """See DataFrame.sort_values for docstring""" if na_position not in ("first", "last"): raise ValueError("na_position must be either 'first' or 'last'") if not isinstance(by, list): by = [by] if len(by) > 1 and df.npartitions > 1 or any(not isinstance(b, str) for b in by): raise NotImplementedError( "Dataframes only support sorting by named columns which must be passed as a " "string or a list of strings; multi-partition dataframes only support sorting " "by a single column.\n" "You passed %s" % str(by)) sort_kwargs = { "by": by, "ascending": ascending, "na_position": na_position, } if sort_function is None: sort_function = M.sort_values if sort_function_kwargs is not None: sort_kwargs.update(sort_function_kwargs) if df.npartitions == 1: return df.map_partitions(sort_function, **sort_kwargs) if npartitions == "auto": repartition = True npartitions = max(100, df.npartitions) else: if npartitions is None: npartitions = df.npartitions repartition = False sort_by_col = df[by[0]] divisions, mins, maxes = _calculate_divisions(df, sort_by_col, repartition, npartitions, upsample, partition_size) if len(divisions) == 2: return df.repartition(npartitions=1).map_partitions( sort_function, **sort_kwargs) if not isinstance(ascending, bool): # support [True] as input if (isinstance(ascending, list) and len(ascending) == 1 and isinstance(ascending[0], bool)): ascending = ascending[0] else: raise NotImplementedError( f"Dask currently only supports a single boolean for ascending. You passed {str(ascending)}" ) if (all(not pd.isna(x) for x in divisions) and mins == sorted(mins, reverse=not ascending) and maxes == sorted(maxes, reverse=not ascending) and all(mx < mn for mx, mn in zip( maxes[:-1] if ascending else maxes[1:], mins[1:] if ascending else mins[:-1], )) and npartitions == df.npartitions): # divisions are in the right place return df.map_partitions(sort_function, **sort_kwargs) df = rearrange_by_divisions( df, by, divisions, ascending=ascending, na_position=na_position, duplicates=False, ) df = df.map_partitions(sort_function, **sort_kwargs) return df
def control_loading(): d_control_data = {} df = Facture.load_db() # Load table manager table_man = TableLoader(Facture.l_index, Facture.l_fields()) # App 1 table of bill waiting for visa ref_date = pd.Timestamp('1970-01-01') df['date_visa'] = df.date_visa.apply(lambda x: pd.Timestamp(x)) df['date_payed'] = df.date_payed.apply(lambda x: pd.Timestamp(x)) df_, d_footer, kwargs = table_man.load_full_table( df.loc[df.date_visa.apply( lambda x: pd.isna(x) or x == '' or x == ref_date)]) d_control_data['tablenovisa'] = { 'table': { 'df': df_.copy(), 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'nothing' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures en attente de visa', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 0 } # App 2 table of bill waiting for payment df_, d_footer, kwargs = table_man.load_full_table( df.loc[~df.date_visa.apply( lambda x: pd.isna(x) or x == '' or x == ref_date) & df.date_payed.apply( lambda x: pd.isna(x) or x == '' or x == ref_date)]) d_control_data['tablenopayement'] = { 'table': { 'df': df_.copy(), 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'visa' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures en attente de paiement', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 1 } # App 3 table of bill payed df_, d_footer, kwargs = table_man.load_full_table( df.loc[df.date_payed > ref_date]) d_control_data['tablepayment'] = { 'table': { 'df': df_, 'd_footer': d_footer, 'kwargs': kwargs, 'key': 'payement' }, 'rows': [('title', [{ 'content': 'title', 'value': u'Factures encaissées', 'cls': 'text-center' }]), ('Table', [{ 'content': 'table' }])], 'rank': 2 } return d_control_data
def __row_isna_check(self, row):
    return pd.isna(row[1]['Solar Index'])
def compute_confison_matrix(predicted_df, ground_truth_df, iou_intersection_th = 0.4 ,debug = False, calculate_distance = False, minimum_distance = None): """ This piece of code essentially computes the confusion matrix between Ground Truth and predicted Video :param predicted_df -- prediction dataframe :param ground_truth_df -- groundtruth dataframe :param iou_intersection_th -- IOU iou_intersection_th threshold :param calculate_distance -- If True , compute the centroid distance between true and predicted boxes :param minimum_distance -- Distance threshold between two pred and true boxes """ # Defining the Output variables which has to be returned result = {} # Defining the variables which wil be used in the computation tp = 0 tn = 0 unique_all_gt_box = [] unique_all_pred_box= [] df_pred = predicted_df.copy() # df_pred = alter_predicted_csv(predicted_df = df_pred) GT = ground_truth_df.copy() # Setting up an dataframe to store the details of matched predicted box with GroundTruth Box count_correctness = pd.DataFrame(columns=['NAME','IOU_THRESHOLD', 'GT_BOX', 'MATCHED_PRED_BOX', 'IOU_BTWN_GT_AND_PRED_BOX', 'COUNT_OF_OTHER_BOXES_MATCHED_WITH_GT_FOR_GIVEN_IOU']) # Processing the frames in the csv to compute its confusion matrix for frame_name in GT['name'].unique().tolist(): # Values changing w.r.t. every frame in the given video file bbox_num_mapping = {} bbox_with_iou = {} matched_pred_box = {} # Getting all the GroundTruth Frames for the given FRAME NAME or FRAME NO from GroundTruth dataframe cur_frame_gt_boxes = GT[GT['name'] == frame_name].values.tolist() for box in cur_frame_gt_boxes: bbox_num_mapping[box[1]] = box[2:10] bbox_with_iou [box[1]] = 0.0 unique_all_gt_box.append(box[2:10]) # Getting all the prediction for the current frame from the predictions csv cur_frame_predictions = df_pred[df_pred['name'] == frame_name].values.tolist() if cur_frame_predictions: cur_frame_pred_boxes = np.array(cur_frame_predictions[0][1:]).reshape(-1,4).tolist() cur_frame_pred_boxes = [box for box in cur_frame_pred_boxes if not pd.isna(box[0])] for pred_box in cur_frame_pred_boxes : unique_all_pred_box.append(pred_box) pred_box_poly = make_polygon_from_4_coordinates(box = pred_box) per_pred_box_iou = {} for box_no , gt_box in bbox_num_mapping.items(): gt_box_poly = make_polygon_from_8_coordinates(box = gt_box) iou = gt_box_poly.intersection(pred_box_poly).area / gt_box_poly.union(pred_box_poly).area if calculate_distance: x_centroid, y_centroid = get_centroid(box = gt_box) x_centroid_p, y_centroid_p = get_centroid_from_4_coordinates(box = pred_box) centroid_dis = centroid_distance(x_centroid, y_centroid, x_centroid_p, y_centroid_p) if (iou >= iou_intersection_th and centroid_dis < minimum_distance): per_pred_box_iou[box_no] = iou else: if (iou >= iou_intersection_th): per_pred_box_iou[box_no] = iou if per_pred_box_iou: matched_gt_box_no = max(per_pred_box_iou, key = per_pred_box_iou.get) if per_pred_box_iou[matched_gt_box_no] > bbox_with_iou[matched_gt_box_no]: bbox_with_iou[matched_gt_box_no] = per_pred_box_iou[matched_gt_box_no] matched_pred_box[matched_gt_box_no] = pred_box tp += len(matched_pred_box) for gt_box_no , mat_box in matched_pred_box.items(): count_correctness = count_correctness.append({'NAME' : frame_name, 'IOU_THRESHOLD' : iou_intersection_th, 'GT_BOX' : bbox_num_mapping[gt_box_no], 'MATCHED_PRED_BOX' : mat_box, 'IOU_BTWN_GT_AND_PRED_BOX' : bbox_with_iou[gt_box_no]}, ignore_index = True) # False Positives - unique all predicted boxes - true predicted boxes ( TRUE POSITIVES) fp = 
len(unique_all_pred_box) - tp assert len(GT) == len(unique_all_gt_box) # False Negatives - unique all GroundTruth boxes - true predicted boxes ( TRUE POSITIVES) fn = len(unique_all_gt_box) - tp # Computation of Confusion Matrix if tp == 0: acc = 0 recall = 0 precision = 0 f_measure = 0 else: acc = (tp+tn)/(tp+tn+fp+fn) recall = tp/(tp+fn) precision = tp/(tp+fp) f_measure = (2*recall*precision) / (recall+precision) # Assigning the results to the result['tp'] = tp result['fp'] = fp result['fn'] = fn result['tn'] = tn result['acc'] = acc * 100 result['precision'] = precision result['recall'] = recall result['f_measure'] = f_measure if debug: result['count_correctness'] = count_correctness return result
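# Standalone sketch of the IoU computation used in the confusion-matrix code above,
# with plain shapely boxes (the coordinates here are hypothetical;
# shapely.geometry.box builds an axis-aligned rectangle):
from shapely.geometry import box

gt = box(0, 0, 10, 10)
pred = box(5, 0, 15, 10)
iou = gt.intersection(pred).area / gt.union(pred).area
print(iou)  # 0.333... -- overlap area of 50 over a union area of 150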