# In[101]:
a1

# In[106]:
sorted(df['PassengerId'])

# In[107]:
df.columns

# In[109]:
newp = pd.Categorical(df['Pclass'])

# In[110]:
newp

# In[111]:
df['Cabin']

# In[112]:
import numpy as np
char_cabin = df['Cabin'].astype(str)
df3 = read_parquet(tmp, columns=['f', 'i32'], engine=engine)
assert_eq(df[['f', 'i32']], df3, check_index=False)


@pytest.mark.parametrize('df,write_kwargs,read_kwargs', [
    (pd.DataFrame({'x': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': ['c', 'a', 'b']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': ['cc', 'a', 'bbb']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': [b'a', b'b', b'c']}), {'object_encoding': 'bytes'}, {}),
    (pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'])}),
     {'object_encoding': 'utf8'}, {'categories': ['x']}),
    (pd.DataFrame({'x': pd.Categorical([1, 2, 1])}), {}, {'categories': ['x']}),
    (pd.DataFrame({'x': list(map(pd.Timestamp, [3000, 2000, 1000]))}), {}, {}),
    (pd.DataFrame({'x': [3000, 2000, 1000]}).astype('M8[ns]'), {}, {}),
    pytest.mark.xfail((pd.DataFrame({'x': [3, 2, 1]}).astype('M8[ns]'), {}, {}),
                      reason="Parquet doesn't support nanosecond precision"),
import matplotlib.pyplot as pyt
from scipy.stats import linregress

pd.set_option("max_rows", 999)
pd.set_option("max_columns", 999)

# Setting up the input and output paths:
Input_Path = r"F:/OFZ/OneDrive - Anheuser-Busch InBev/_MUKIL_/00_WORK/00_PROJECTS/14_CHURN_PREDICTION_ONTRADE_UK/03.Output/Model_Input/"
Output_Path = r"F:/OFZ/OneDrive - Anheuser-Busch InBev/_MUKIL_/00_WORK/00_PROJECTS/14_CHURN_PREDICTION_ONTRADE_UK/03.Output/Model_Results/"

# %%
### Importing the data:
Data = pd.read_csv(Input_Path + 'POC_Churn_Status_20082020.csv').drop(columns='key')
Data.sort_values(by=['Outlet Id', 'Year', 'Month'], inplace=True)

Data_Trans = Data.copy(deep=True)
Data_Trans["Year_Month"] = pd.Categorical(Data_Trans["Year_Month"])

# %%
Raw_Data = pd.read_csv(Input_Path + 'Unpivoted_Data_Latest_25082020.csv')
Raw_Data.head()

# %%
### EDA:
print('Total Number of POCs by Year: ',
      Data[Data["Start_Restart"] != 'Inactive']["Outlet Id"].nunique())
print("Volume by Year: ")
Data.groupby(["Year"], as_index=False).agg({"Volume": sum})

# %%
### Functions:
a2 = dat1.pivot_table(index=['Policy_Number'], columns='Main_Insurance_Coverage_Group',
                      values=['Insured_Amount1', 'Insured_Amount2', 'Insured_Amount3',
                              'Coverage_Deductible_if_applied'], fill_value=0)

a3 = dat1.groupby(by='Policy_Number', axis=0, sort=False).Insurance_Coverage.value_counts().\
    reset_index(name='Insurance_Coverage_count')
a3 = a3.pivot_table(index='Policy_Number', columns='Insurance_Coverage',
                    values='Insurance_Coverage_count', fill_value=0)

# Fill missing values
dat1.Vehicle_identifier = dat1.Vehicle_identifier.fillna(dat1.Policy_Number)
dat1.Prior_Policy_Number = dat1.Prior_Policy_Number.fillna('0')
dat1.count()[dat1.count() < 1747942]

# Convert Insured's_ID (and the other label columns) to numeric category codes
dat1["Insured's_ID"] = pd.Categorical(dat1["Insured's_ID"])
dat1["Insured's_ID"] = dat1["Insured's_ID"].cat.codes
dat1.Vehicle_identifier = pd.Categorical(dat1.Vehicle_identifier)
dat1.Vehicle_identifier = dat1.Vehicle_identifier.cat.codes
dat1.Vehicle_Make_and_Model1 = pd.Categorical(dat1.Vehicle_Make_and_Model1)
dat1.Vehicle_Make_and_Model1 = dat1.Vehicle_Make_and_Model1.cat.codes
dat1.Vehicle_Make_and_Model2 = pd.Categorical(dat1.Vehicle_Make_and_Model2)
dat1.Vehicle_Make_and_Model2 = dat1.Vehicle_Make_and_Model2.cat.codes
dat1.Distribution_Channel = pd.Categorical(dat1.Distribution_Channel)
dat1.Distribution_Channel = dat1.Distribution_Channel.cat.codes
dat1.aassured_zip = pd.Categorical(dat1.aassured_zip)
dat1.aassured_zip = dat1.aassured_zip.cat.codes
dat1.iply_area = pd.Categorical(dat1.iply_area)
dat1.iply_area = dat1.iply_area.cat.codes
dat1.Prior_Policy_Number = pd.Categorical(dat1.Prior_Policy_Number)
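# A minimal alternative sketch (not in the original script): the repeated
# Categorical/cat.codes conversions above could be collapsed into one loop
# over the label-encoded columns.
label_cols = ["Insured's_ID", 'Vehicle_identifier', 'Vehicle_Make_and_Model1',
              'Vehicle_Make_and_Model2', 'Distribution_Channel',
              'aassured_zip', 'iply_area', 'Prior_Policy_Number']
for col in label_cols:
    dat1[col] = pd.Categorical(dat1[col]).codes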
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

iris = pd.read_table("iris.txt", sep=',',
                     names=('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm',
                            'PetalWidthCm', 'Species'))
print(iris.head())

X = iris.drop('Species', axis=1).values
y = pd.Categorical(iris['Species']).codes

from sklearn.cluster import KMeans
estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1, init='random')}

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(8, 6))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    plt.cla()
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
def test_constructor_empty_boolean(self):
    # see gh-22702
    cat = pd.Categorical([], categories=[True, False])
    categories = sorted(cat.categories.tolist())
    assert categories == [False, True]
def corpus_to_df(path, metadata_columns):
    def load_corpus_text(path_full, column_name):
        _, filenames, _ = loadCorpus(path_full)
        texts = []
        news = []
        unicodes_to_strip = {
            "\n\n": " ",
            "\n": " ",
            "\ufeff": "",
            "\x85": "",
            "\x91": "",
            "\x92": "",
            "\x93": "",
            "\x94": "",
            "\x96": "",
            "\x97": "",
            "\t": ""
        }
        #print(key for key in unicodes_to_strip)
        for file_name in filenames:
            with open(file_name, 'r', encoding='utf8') as text:
                news = text.read()
                for key in unicodes_to_strip:
                    news = news.replace(key, unicodes_to_strip[key])
                texts.append(news)
        text_df = pd.DataFrame(texts, columns=[column_name])
        return text_df

    def load_meta(path, metadata_columns):
        meta_ids = []
        meta_filenames = []
        meta_tags = []
        for filename in os.listdir(
                os.path.join(path, 'full_texts', 'true-meta-information')):
            meta_ids.append(filename.replace('-meta.txt', '-REAL'))
            meta_filenames.append(
                os.path.join(path, 'full_texts', 'true-meta-information',
                             filename))
            meta_tags.append('REAL')
        # From the fake news folder
        for filename in os.listdir(
                os.path.join(path, 'full_texts', 'fake-meta-information')):
            meta_ids.append(filename.replace('-meta.txt', '-FAKE'))
            meta_filenames.append(
                os.path.join(path, 'full_texts', 'fake-meta-information',
                             filename))
            meta_tags.append('FAKE')
        meta_ids, meta_filenames, meta_tags = (list(t) for t in zip(
            *sorted(zip(meta_ids, meta_filenames, meta_tags))))
        meta_ids = pd.DataFrame(meta_ids, columns=['Id'])
        meta_tags = pd.DataFrame(meta_tags, columns=['Tag'])
        metadatas = []
        for filename in meta_filenames:
            with open(filename, 'r', encoding='utf8') as text:
                metadatas.append(text.read().splitlines())
        data_df = pd.DataFrame(metadatas, columns=metadata_columns)
        meta_df = pd.concat([meta_ids, data_df, meta_tags], axis=1)
        #print(meta_df.head())
        #print(metadata_columns)
        return meta_df

    news_text_full_df = load_corpus_text(os.path.join(path, 'full_texts'),
                                         'news_text_full')
    news_text_normalized_df = load_corpus_text(
        os.path.join(path, 'size_normalized_texts'), 'news_text_normalized')
    news_meta_df = load_meta(path, metadata_columns)
    result_df = pd.concat(
        [news_text_full_df, news_text_normalized_df, news_meta_df], axis=1)
    #print(result_df)
    #print(ns.natsorted(result_df['Id'].unique()))
    result_df['Id'] = pd.Categorical(result_df['Id'],
                                     ordered=True,
                                     categories=ns.natsorted(
                                         result_df['Id'].unique()))
    result_df = result_df.sort_values('Id')
    result_df = result_df.set_index('Id')
    return result_df
import pandas as pd
import seaborn as sns
import sys
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

img = mpimg.imread('floorplan2.png')
#img = cv2.imread('floorplan-n.png')

data = pd.read_csv('input.csv')

# Transform it to a long format
df = data.unstack().reset_index()
df.columns = ["X", "Y", "Z"]

# And transform the old column name into something numeric
df['X'] = pd.Categorical(df['X'])
df['X'] = df['X'].cat.codes

# Make the plot
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.set_zlim3d(45, 80)
surf = ax.plot_trisurf(df['Y'], df['X'], df['Z'], cmap=plt.cm.jet,
                       linewidth=0.2, vmin=60, vmax=80)

height, width = img.shape[:2]
# 64 is the length of the x and y axes of the surface
stepX, stepY = 64.0/width, 64.0/height
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)

# Create a Series of dates
# Six days starting from 2019/01/01
dates1 = pd.date_range('20190101', periods=6)
print(dates1)

# DataFrame with A: numeric, B: date, C: 1, D: 3, E: test/train, F: foo
df = pd.DataFrame({'A': 1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                   'D': np.array([3]*4, dtype='int32'),
                   'E': pd.Categorical(["test", "train", "test", "train"]),
                   'F': 'foo'})
print(df)

# Check the dtype of each column
print("type of df:")
print(df.dtypes)

# Convert a matrix (ndarray) to a DataFrame
matrix = np.random.randn(6, 4)  # 6 rows x 4 columns
df2 = pd.DataFrame(matrix, columns=list('ABCD'))  # data: matrix, column names: columns
print(df2)

# Extract the head
# First 3 rows
              check_index=False, check_divisions=should_check_divs(engine))

    df3 = dd.read_parquet(tmp, columns=['f', 'i32'], engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['f', 'i32']], df3,
              check_index=False, check_divisions=should_check_divs(engine))


@pytest.mark.parametrize('df,write_kwargs,read_kwargs', [
    (pd.DataFrame({'x': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': ['c', 'a', 'b']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': ['cc', 'a', 'bbb']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': [b'a', b'b', b'c']}), {'object_encoding': 'bytes'}, {}),
    (pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'])}),
     {'object_encoding': 'utf8'}, {'categories': ['x']}),
    (pd.DataFrame({'x': pd.Categorical([1, 2, 1])}), {}, {'categories': ['x']}),
    (pd.DataFrame({'x': list(map(pd.Timestamp, [3000, 2000, 1000]))}), {}, {}),
    (pd.DataFrame({'x': [3000, 2000, 1000]}).astype('M8[ns]'), {}, {}),
    pytest.mark.xfail((pd.DataFrame({'x': [3, 2, 1]}).astype('M8[ns]'), {}, {}),
                      reason="Parquet doesn't support nanosecond precision"),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('M8[us]'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('M8[ms]'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('uint16'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('float32'), {}, {}),
    (pd.DataFrame({'x': [3, 1, 2]}, index=[3, 2, 1]), {}, {}),
    (pd.DataFrame({'x': [3, 1, 5]}, index=pd.Index([1, 2, 3], name='foo')), {}, {}),
    (pd.DataFrame({'x': [1, 2, 3], 'y': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': [1, 2, 3],
df2 = pd.read_csv("cluster1.csv") df2 = df2.fillna(0) # In[10]: ### cluster_ transfer the string data into the numerical type name = [] name = [ "CountryCitizen", "CountryLive", "EmploymentField", "EmploymentStatus", "JobApplyWhen", "JobPref", "JobRoleInterest", "JobWherePref", "LanguageAtHome", "MaritalStatus", "SchoolDegree", "SchoolMajor" ] for i in name: df2[i] = pd.Categorical(df2[i]) df2[i] = df2[i].cat.codes # In[71]: ### cluster_ normalize the data X = np.array(df2) X = StandardScaler().fit_transform(X) print(X.shape) # In[97]: ### try different numbers of clustering data_num = X.shape[0] err_clustering = np.zeros([21, 1])
import feather
import math  # built in
import matplotlib.pyplot as plt
import numpy as np
import os  # built in
import pandas as pd
import random  # built in
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

c4 = pd.read_csv('../data/raw/c4-game-database.csv')
c4.dropna(inplace=True)

for p in range(1, 43):
    c4[f"pos_{p:02d}"] = pd.Categorical(c4[f"pos_{p:02d}"].astype(int))
c4['winner'] = pd.Categorical(c4['winner'].astype(int))

c4.to_csv('../data/processed/c4-game-database.csv', index=False)

# X = ttt.iloc[:, 0:9].values
# y = ttt.iloc[:, 9:10].values
#
# # Encode categorical variables as numeric
# labelencoder_X = LabelEncoder()
# for _ in range(9):
#     X[:, _] = labelencoder_X.fit_transform(X[:, _])
#
# # Onehot encode all dependent categorical variables
# onehotencoder = OneHotEncoder(categorical_features = [0,1,2,3,4,5,6,7,8])
def tocategory(col):
    return pd.Categorical(col)
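# A minimal usage sketch (assumed, not from the original source): the returned
# Categorical exposes integer codes and the inferred categories.
labels = tocategory(pd.Series(['low', 'high', 'low', 'mid']))
print(labels.codes)       # [1 0 1 2] -- int8 codes indexing into labels.categories
print(labels.categories)  # Index(['high', 'low', 'mid'], dtype='object')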
def process_week(config, source, week_file):
    """Process a single week file

    * Retrieve the file, extracting the photon and spacecraft info
    * Select photons near the source
    * Determine exposure for the direction
    * Use the weight table to add weights to photon data, selecting photons
      with weight info
    -- in progress --
    * Use the exposure to assign an exposure to each photon.
    """
    with open(week_file, 'rb') as inp:
        week = pickle.load(inp)

    pdf = _get_photons_near_source(config, source, week)
    edf = _calculate_exposure_for_source(config, source, week)
    if config.verbose > 2:
        print(f'\n\t-->Selected {len(pdf)} photons')

    # add weights
    if pdf is None or len(pdf) < 3 or len(edf) == 0:
        return None, edf
    add_weights(config, pdf, source)

    if 'run_id' in pdf:
        # expint = np.empty(2*len(edf))
        estart = edf.start.values
        estop = edf.stop.values
        exptime = np.append(estart, estop[-1])
        expval = edf.exp.values
        expcth = edf.cos_theta.values
        # corresponding cumulative exposure -- in m^2
        cumexp = np.insert(np.cumsum(edf.exp.values / 1e4), 0, 0)
        # i = np.searchsorted(expint[0::2], MJD(pdf.iloc[0].run_id) )

        runs = pdf.groupby('run_id')
        last_run = 0
        tau = []
        time = []
        run_id = []
        for run, g in runs:
            assert run > last_run
            run_id += [run] * len(g)
            last_run = run
            # assemble MJD time from run_id and trun
            runstart = MJD(float(run))
            rtime = MJD(float(run) + g.trun * config.offset_size)
            time += list(rtime)
            # cumexp at run start
            run_cumexp = cumexp[np.searchsorted(estart, runstart)]
            # use event times in this run to interpolate table of exposure times, cumexp
            event_cumexp = np.interp(rtime, exptime, cumexp)
            # diffs, from first --> tau
            event_exp = np.diff(np.insert(event_cumexp, 0, run_cumexp))
            tau += list(event_exp)
            # # extract cos_theta at event_time? should interpolate maybe
            # cth += expcth[np.searchsorted(rtime, estart )]

        # update pdf
        pdf.loc[:, 'tau'] = np.array(tau, np.float32)
        pdf.loc[:, 'time'] = time
        pdf.drop(columns='trun', inplace=True)
        pdf.loc[:, 'run_id'] = pd.Categorical(run_id)
    else:
        # zap legacy for now
        for check in 'etime event run_diff rtime run'.split():
            if check in pdf:
                if config.verbose > 2:
                    print(f'remove {check}')
                pdf.drop(columns=check, inplace=True)

    # final attempt to do this
    pdf.loc[:, 'weight'] = pdf['weight'].astype(np.float32)

    return pdf, edf
def test_categorical(self, fp):
    if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
        pytest.skip("CategoricalDtype not supported for older fp")
    df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
    check_round_trip(df, fp)
# # plt.plot(X, y, 'ro')
# plt.plot(X, X*lr.coef_ + lr.intercept_)
# plt.grid(True)
# plt.show()
#
# print('Delivery time for a delivery distance of 200 m\n', lr.predict(np.array([[200]])))

# Multivariate linear regression
# Predict newborn weight from smoking status and gestational week
path1 = 'C:/Users/TJ/Google 드라이브/학습자료/프로그래밍/data science/Sample data/r/pregnant.txt'
mother = pd.read_csv(path1, sep='\t', engine='python')

print(mother['Smoke'][:5])
mother['Smoke'] = pd.Categorical(mother['Smoke'])
mother['Smoke'] = mother['Smoke'].cat.codes
print(mother['Smoke'][:5])

# Draw a scatter plot
# plt.plot(mother['Week'], mother['Wgt'], 'go')
plt.scatter(mother['Week'], mother['Wgt'], c=mother['Smoke'])
plt.show()

# Fit the linear regression model
lr = LinearRegression()
Xvar = ["Week", "Smoke"]
lr.fit(mother[Xvar], mother['Wgt'])
print('Slope (coefficients)', lr.coef_)  # weights
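# A hedged follow-up (not in the original script): with the fitted model above,
# a prediction for a hypothetical case -- e.g. gestational week 38, smoker
# (Smoke code 1) -- could look like this:
new_obs = pd.DataFrame({'Week': [38], 'Smoke': [1]})
print('Predicted weight:', lr.predict(new_obs))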
def test_constructor_np_strs(self):
    # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
    cat = pd.Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
    assert all(isinstance(x, np.str_) for x in cat.categories)
def read_metadata(
    fs,
    paths,
    categories=None,
    index=None,
    gather_statistics=None,
    filters=None,
    split_row_groups=True,
    **kwargs,
):
    # Define the dataset object to use for metadata,
    # Also, initialize `parts`. If `parts` is populated here,
    # then each part will correspond to a file. Otherwise, each part will
    # correspond to a row group (populated below)
    parts, dataset = _determine_dataset_parts(
        fs, paths, gather_statistics, filters, kwargs.get("dataset", {})
    )

    # Check if the column-chunk file_path's are set in "_metadata".
    # If available, we can use the path to sort the row-groups
    col_chunk_paths = False
    if dataset.metadata:
        col_chunk_paths = all(
            dataset.metadata.row_group(i).column(0).file_path is not None
            for i in range(dataset.metadata.num_row_groups)
        )

    # TODO: Call to `_determine_dataset_parts` uses `pq.ParquetDataset`
    # to define the `dataset` object. `split_row_groups` should be passed
    # to that constructor once it is supported (see ARROW-2801).
    if dataset.partitions is not None:
        partitions = [
            n for n in dataset.partitions.partition_names if n is not None
        ]
        if partitions and dataset.metadata:
            # Dont use dataset.metadata for partitioned datasets, unless
            # the column-chunk metadata includes the `"file_path"`.
            # The order of dataset.metadata.row_group items is often
            # different than the order of `dataset.pieces`.
            if not col_chunk_paths or (
                len(dataset.pieces) != dataset.metadata.num_row_groups
            ):
                dataset.schema = dataset.metadata.schema
                dataset.metadata = None
    else:
        partitions = []

    # Statistics are currently collected at the row-group level only.
    # Therefore, we cannot perform filtering with split_row_groups=False.
    # For "partitioned" datasets, each file (usually) corresponds to a
    # row-group anyway.
    # TODO: Map row-group statistics onto file pieces for filtering.
    # This shouldn't be difficult if `col_chunk_paths==True`
    if not split_row_groups and not col_chunk_paths:
        if gather_statistics is None and not partitions:
            gather_statistics = False
        if filters:
            raise ValueError(
                "Filters not supported with split_row_groups=False "
                "(unless proper _metadata is available)."
            )
        if gather_statistics and not partitions:
            raise ValueError(
                "Statistics not supported with split_row_groups=False."
                "(unless proper _metadata is available)."
            )

    if dataset.metadata:
        schema = dataset.metadata.schema.to_arrow_schema()
    else:
        schema = dataset.schema.to_arrow_schema()
    columns = None

    has_pandas_metadata = (
        schema.metadata is not None and b"pandas" in schema.metadata
    )

    if has_pandas_metadata:
        pandas_metadata = json.loads(schema.metadata[b"pandas"].decode("utf8"))
        (
            index_names,
            column_names,
            storage_name_mapping,
            column_index_names,
        ) = _parse_pandas_metadata(pandas_metadata)
        if categories is None:
            categories = []
            for col in pandas_metadata["columns"]:
                if (col["pandas_type"] == "categorical") and (
                    col["name"] not in categories
                ):
                    categories.append(col["name"])
    else:
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]

    if index is None and index_names:
        index = index_names

    if set(column_names).intersection(partitions):
        raise ValueError(
            "partition(s) should not exist in columns.\n"
            "categories: {} | partitions: {}".format(column_names, partitions)
        )

    column_names, index_names = _normalize_index_columns(
        columns, column_names + partitions, index, index_names
    )

    all_columns = index_names + column_names

    pieces = sorted(dataset.pieces, key=lambda piece: natural_sort_key(piece.path))

    # Check that categories are included in columns
    if categories and not set(categories).intersection(all_columns):
        raise ValueError(
            "categories not in available columns.\n"
            "categories: {} | columns: {}".format(categories, list(all_columns))
        )

    dtypes = _get_pyarrow_dtypes(schema, categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    index_cols = index or ()
    meta = _meta_from_dtypes(all_columns, dtypes, index_cols, column_index_names)
    meta = clear_known_categories(meta, cols=categories)

    if (
        gather_statistics is None
        and dataset.metadata
        and dataset.metadata.num_row_groups >= len(pieces)
    ):
        gather_statistics = True
    if not pieces:
        gather_statistics = False

    if filters:
        # Filters may require us to gather statistics
        if gather_statistics is False and partitions:
            warnings.warn(
                "Filtering with gather_statistics=False. "
                "Only partition columns will be filtered correctly."
            )
        elif gather_statistics is False:
            raise ValueError("Cannot apply filters with gather_statistics=False")
        elif not gather_statistics:
            gather_statistics = True

    row_groups_per_piece = None
    if gather_statistics:
        # Read from _metadata file
        if dataset.metadata and dataset.metadata.num_row_groups >= len(pieces):
            row_groups = [
                dataset.metadata.row_group(i)
                for i in range(dataset.metadata.num_row_groups)
            ]
            # Re-order row-groups by path name if known
            if col_chunk_paths:
                row_groups = sorted(
                    row_groups,
                    key=lambda row_group: natural_sort_key(
                        row_group.column(0).file_path
                    ),
                )
            if split_row_groups and len(dataset.paths) == 1:
                row_groups_per_piece = _get_row_groups_per_piece(
                    pieces, dataset.metadata, dataset.paths[0], fs
                )
            names = dataset.metadata.schema.names
        else:
            # Read from each individual piece (quite possibly slow).
            row_groups, row_groups_per_piece = _get_md_row_groups(pieces)
            if row_groups:
                piece = pieces[0]
                md = piece.get_metadata()
                names = md.schema.names
            else:
                gather_statistics = False

    if gather_statistics:
        stats = []
        skip_cols = set()  # Columns with min/max = None detected
        path_last = None
        for ri, row_group in enumerate(row_groups):
            s = {"num-rows": row_group.num_rows, "columns": []}
            for i, name in enumerate(names):
                if name not in skip_cols:
                    column = row_group.column(i)
                    d = {"name": name}
                    if column.statistics:
                        cs_min = column.statistics.min
                        cs_max = column.statistics.max
                        if not column.statistics.has_min_max:
                            cs_min, cs_max = None, None
                        if None in [cs_min, cs_max] and ri == 0:
                            skip_cols.add(name)
                            continue
                        cs_vals = pd.Series([cs_min, cs_max])
                        d.update(
                            {
                                "min": cs_vals[0],
                                "max": cs_vals[1],
                                "null_count": column.statistics.null_count,
                            }
                        )
                    s["columns"].append(d)
            s["total_byte_size"] = row_group.total_byte_size
            if col_chunk_paths:
                s["file_path_0"] = row_group.column(0).file_path
                if not split_row_groups and (s["file_path_0"] == path_last):
                    # Rather than appending a new "row-group", just merge
                    # new `s` statistics into last element of `stats`.
                    # Note that each stats element will now correspond to an
                    # entire file (rather than actual "row-groups")
                    _merge_statistics(stats, s)
                    continue
                else:
                    path_last = s["file_path_0"]
            stats.append(s)
    else:
        stats = None

    if dataset.partitions:
        for partition in dataset.partitions:
            if isinstance(index, list) and partition.name == index[0]:
                meta.index = pd.CategoricalIndex(
                    categories=partition.keys, name=index[0]
                )
            elif partition.name == meta.index.name:
                meta.index = pd.CategoricalIndex(
                    categories=partition.keys, name=meta.index.name
                )
            elif partition.name in meta.columns:
                meta[partition.name] = pd.Categorical(
                    categories=partition.keys, values=[]
                )

    # Create `parts`
    # This is a list of row-group-descriptor dicts, or file-paths
    # if we have a list of files and gather_statistics=False
    if not parts:
        if split_row_groups and row_groups_per_piece:
            # TODO: This block can be removed after ARROW-2801
            parts = []
            rg_tot = 0
            for i, piece in enumerate(pieces):
                num_row_groups = row_groups_per_piece[i]
                for rg in range(num_row_groups):
                    parts.append((piece.path, rg, piece.partition_keys))
                    # Setting file_path here, because it may be
                    # missing from the row-group/column-chunk stats
                    if "file_path_0" not in stats[rg_tot]:
                        stats[rg_tot]["file_path_0"] = piece.path
                    rg_tot += 1
        else:
            parts = [
                (piece.path, piece.row_group, piece.partition_keys)
                for piece in pieces
            ]
    parts = [
        {
            "piece": piece,
            "kwargs": {"partitions": dataset.partitions, "categories": categories},
        }
        for piece in parts
    ]

    return (meta, stats, parts)
def test_constructor_string_and_tuples(self):
    # GH 21416
    c = pd.Categorical(
        np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
    expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
    assert c.categories.equals(expected_index)
["1H", "2H"], np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), None, TimedeltaArray._from_sequence(["1H", "2H"]), ), # Category (["a", "b"], "category", pd.Categorical(["a", "b"])), ( ["a", "b"], pd.CategoricalDtype(None, ordered=True), pd.Categorical(["a", "b"], ordered=True), ), # Interval ( [pd.Interval(1, 2, "right"), pd.Interval(3, 4, "right")], "interval", IntervalArray.from_tuples([(1, 2), (3, 4)], "right"), ), # Sparse ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")), # IntegerNA