def missing_values_table(df: pd.DataFrame) -> pd.DataFrame:
    """Count of missing values in a DataFrame.

    https://habr.com/post/414613/
    :param df: pd.DataFrame:
    """
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = \
        mis_val_table.rename(
            columns={0: 'Missing Values', 1: '% of Total Values'}
        )
    # Sort by percentage of missing values
    mis_val_table_ren_columns = \
        mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:, 1] != 0]\
        .sort_values('% of Total Values', ascending=False)\
        .round(6)
    # Summary info
    print("The selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns with missing values.")
    # Return the table
    return mis_val_table_ren_columns
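# A minimal usage sketch for missing_values_table; the toy frame and its
# column names are illustrative assumptions, not from the original source.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, 1.0]})
print(missing_values_table(toy))
# Expected output: one row per column that has nulls, with 'Missing Values'
# and '% of Total Values', sorted descending by percentage.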
def clean(in_data: pd.DataFrame) -> pd.DataFrame:
    assert 'question1' in in_data.columns
    assert 'question2' in in_data.columns
    print("removing nan")
    # drop rows where either question is missing
    in_data = in_data.dropna(subset=['question1', 'question2'])
    print("fixing contractions")
    in_data['question1'] = np.vectorize(contractions.fix)(in_data['question1'])
    in_data['question2'] = np.vectorize(contractions.fix)(in_data['question2'])
    print("fixing emoji")
    in_data['question1'] = np.vectorize(emoji.demojize)(in_data['question1'])
    in_data['question2'] = np.vectorize(emoji.demojize)(in_data['question2'])
    print("cleaning")
    in_data['question1'] = clean_sentence(in_data['question1'])
    in_data['question2'] = clean_sentence(in_data['question2'])
    # cleaning may have produced new empty values; drop them again
    in_data = in_data.dropna(subset=['question1', 'question2'])
    in_data['question1'] = in_data['question1'].str.lower()
    in_data['question2'] = in_data['question2'].str.lower()
    return in_data
def get_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Find the missing values in the given dataframe.

    Create a DataFrame whose index holds the individual variables, whose
    first column 'Total' holds the total number of missing values, and whose
    second column 'Percent' holds the missing values as a percentage of the
    total number of rows in the table.
    Sort the DataFrame from the largest to the smallest values and return it.

    Example:
                | Total | Percent
    "Column1"   | 34    | 76
    "Column2"   | 0     | 0
    """
    columns = list(df)
    sum_of_value = df.isnull().sum()
    # percentage relative to the total number of rows,
    # not to the number of missing values
    percent = [round(i / len(df) * 100, 2) for i in sum_of_value.values]
    df_new = pd.DataFrame(list(zip(sum_of_value.values, percent)),
                          columns=['Total', 'Percent'],
                          index=columns)
    return df_new.sort_values('Total', ascending=False)
def fill_na(all_data: pd.DataFrame):
    all_data_na = get_na(all_data)
    # list the columns that contain missing values
    na_col_list = all_data.isnull().sum()[
        all_data.isnull().sum() > 0].index.tolist()
    # check the dtypes of the columns with missing values
    all_data[na_col_list].dtypes.sort_values()
    # columns with missing values and dtype float64
    float_list = all_data[na_col_list].dtypes[all_data[na_col_list].dtypes
                                              == "float64"].index.tolist()
    # columns with missing values and dtype object
    obj_list = all_data[na_col_list].dtypes[all_data[na_col_list].dtypes ==
                                            "object"].index.tolist()
    # replace missing values with 0 for float columns
    all_data[float_list] = all_data[float_list].fillna(0)
    # replace missing values with "None" for object columns
    all_data[obj_list] = all_data[obj_list].fillna("None")
    # verify that every missing value has been replaced (prints any leftovers)
    print(all_data.isnull().sum()[all_data.isnull().sum() > 0])
    return all_data
def smart_cut(df: pd.DataFrame, threshold=0, weight_col=1):
    """Drop columns and rows with many NaNs, minimizing the loss of
    informative data-points

    Args:
        threshold : percentage of NaNs allowed to remain in the dataframe
        weight_col : importance of columns; penalize dropping columns
            more (weight > 1) or less (weight < 1)

    Returns:
        df (pd.DataFrame): clean dataframe
    """
    while (df.isnull().sum().sum()) / (df.shape[0] * df.shape[1]) > threshold:
        # idxmax returns the row/column *label*, which is what drop expects
        worst_row = df.isnull().sum(axis=1).idxmax()
        worst_row_value = df.isnull().sum(axis=1).max()
        worst_col = df.isnull().sum(axis=0).idxmax()
        worst_col_value = df.isnull().sum(axis=0).max()
        # criterion: minimize the loss of valid data-points
        if (df.shape[1] - worst_row_value) <= (weight_col * (df.shape[0] - worst_col_value)):
            df = df.drop(worst_row)
        else:
            df = df.drop(worst_col, axis=1)
    return df
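# A minimal sketch exercising smart_cut on a toy frame; the data and the
# threshold value are illustrative assumptions, not from the original source.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'x': [1.0, np.nan, 3.0, 4.0],
                    'y': [np.nan, np.nan, np.nan, 4.0],
                    'z': [1.0, 2.0, 3.0, 4.0]})
trimmed = smart_cut(toy, threshold=0.0)
print(trimmed)  # greedily drops the worst row/column until no NaNs remain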
def missing_values_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts and calculates null values per column
    :param df: features' DataFrame
    :return:
    """
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
            '% of Total Values', ascending=False).round(1)
    # Print some summary information
    print("There are " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns with null values")
    # Return the dataframe with missing information
    return mis_val_table_ren_columns
def order_by_nan(df: pd.DataFrame):
    """Order the dataframe's rows and columns by presence of data, so that
    the rows and columns with the most data end up at the top-left"""
    new_index = df.isnull().sum(axis=1).sort_values().index
    new_col_index = df.isnull().sum().sort_values().index
    df = df.reindex(new_index)
    df = df.reindex(columns=new_col_index)
    return df
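# A small usage sketch for order_by_nan; the toy frame is an illustrative
# assumption. Rows/columns with fewer NaNs should come first after the call.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [np.nan, np.nan, 1.0], 'b': [1.0, 2.0, 3.0]})
print(order_by_nan(toy))  # column 'b' and the fully populated row move first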
def hasnulls(df: pd.DataFrame, verbose: bool = False): nulls = df.isnull().sum().sum() print_c(verbose, "Number of nulls", nulls) if nulls > 0: print_c(verbose, df.isnull().sum().sort_values(ascending=False)) return True else: return False
def summary_missing_data(df: pd.DataFrame, lowest_proportion: float = 0.0) -> pd.DataFrame: total = df.isnull().sum().sort_values(ascending=False) percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Count', 'Percent']) return missing_data[missing_data['Percent'] > lowest_proportion]
def missing_table(df: pd.DataFrame):
    null_val = df.isnull().sum()
    percent = 100 * df.isnull().sum() / len(df)
    result = pd.concat([null_val, percent], axis=1)
    missing_table_ren_columns = result.rename(columns={
        0: "Missing Count",
        1: "%"
    })
    return missing_table_ren_columns
def display_missing_data(df: pd.DataFrame) -> pd.DataFrame: total = df.isnull().sum().sort_values(ascending=False) percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) return missing_data
def check_manynull_cols(input_df: pd.DataFrame, null_rate: float = 0.3):
    """Just print the columns whose fraction of missing values exceeds a
    threshold

    Args:
        input_df(pd.DataFrame): input_df
        null_rate(float): show columns whose missing values exceed this
            fraction of all rows
    Return:
        nothing
    """
    n = len(input_df) * null_rate
    print(input_df.isnull().sum()[input_df.isnull().sum() > n])
def filter_ferc714_hourly_demand_matrix(
    df: pd.DataFrame,
    min_data: int = 100,
    min_data_fraction: float = 0.9,
) -> pd.DataFrame:
    """
    Filter incomplete years from FERC 714 hourly demand matrix.

    Nulls respondent-years with too few data and drops respondents with no
    data across all years.

    Args:
        df: FERC 714 hourly demand matrix, as described in
            :func:`load_ferc714_hourly_demand_matrix`.
        min_data: Minimum number of non-null hours in a year.
        min_data_fraction: Minimum fraction of non-null hours between the
            first and last non-null hour in a year.

    Returns:
        Hourly demand matrix `df` modified in-place.
    """
    # Identify respondent-years where data coverage is below thresholds
    has_data = ~df.isnull()
    coverage = (
        # Last timestamp with demand in year
        has_data[::-1].groupby(df.index.year[::-1]).idxmax() -
        # First timestamp with demand in year
        has_data.groupby(df.index.year).idxmax()).apply(
            lambda x: 1 + x.dt.days * 24 + x.dt.seconds / 3600, axis=1)
    fraction = has_data.groupby(df.index.year).sum() / coverage
    short = coverage.lt(min_data)
    bad = fraction.gt(0) & fraction.lt(min_data_fraction)
    # Set all values in short or bad respondent-years to null
    mask = (short | bad).loc[df.index.year]
    mask.index = df.index
    df[mask] = np.nan
    # Report nulled respondent-years
    for mask, msg in [
        (short, 'Nulled short respondent-years (below min_data)'),
        (bad, 'Nulled bad respondent-years (below min_data_fraction)'),
    ]:
        row, col = mask.values.nonzero()
        report = (pd.DataFrame({
            'id': mask.columns[col],
            'year': mask.index[row]
        }).groupby('id')['year'].apply(lambda x: np.sort(x)))
        with pd.option_context('display.max_colwidth', None):
            logger.info(f'{msg}:\n{report}')
    # Drop respondents with no data
    blank = df.columns[df.isnull().all()].tolist()
    df.drop(columns=blank, inplace=True)
    # Report dropped respondents (with no data)
    logger.info(f'Dropped blank respondents: {blank}')
    return df
def missing_data_ratio(df: pd.DataFrame, display=False): # missing data total = df.isnull().sum().sort_values(ascending=False) percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) if display: print(missing_data.head(20)) return missing_data
def remove_features_by_missing_data_ratio(df: pd.DataFrame,
                                          fraction: float = 0.15,
                                          missing_data=None):
    if missing_data is None:
        missing_data = missing_data_ratio(df)
    # dealing with missing data
    df = df.drop((missing_data[missing_data['Percent'] > fraction]).index,
                 axis=1)
    # df = df.drop(df.loc[df['Electrical'].isnull()].index)
    df.isnull().sum().max()  # just checking that there's no missing data left
    return df
def all_valid_verification(X: pd.DataFrame, y: pd.DataFrame) -> bool:
    """
    A verification method that requires all entries in both the feature set
    and the label set to be non-null. Whether this is appropriate depends on
    the specific use case.
    """
    if np.any(X.isnull()):
        return False
    elif np.any(y.isnull()):
        return False
    elif len(X) == 0:
        return False
    return True
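# A minimal usage sketch for all_valid_verification; the toy data are
# illustrative assumptions, not from the original source.
import numpy as np
import pandas as pd

X = pd.DataFrame({'f1': [1.0, 2.0], 'f2': [3.0, 4.0]})
y = pd.DataFrame({'label': [0, 1]})
assert all_valid_verification(X, y) is True
X.loc[0, 'f1'] = np.nan
assert all_valid_verification(X, y) is False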
def kesson_table(df: pd.DataFrame) -> pd.DataFrame:
    """Generate a table containing the missing-value count and percentage

    Arguments:
        df {pd.DataFrame} -- source data

    Returns:
        pd.DataFrame -- columns={0: "Missing Count", 1: "%"}
    """
    null_val = df.isnull().sum()
    percent = 100 * df.isnull().sum() / len(df)
    kesson_table = pd.concat([null_val, percent], axis=1)
    kesson_table_ren_columns = kesson_table.rename(
        columns={0: "Missing Count", 1: "%"})
    return kesson_table_ren_columns
def reduce_vars_corr(df: pd.DataFrame, field_names: list, max_num: float, imputer: str = 'knnimpute'): num_vars = len(field_names) - 1 print('Current vars: {0}'.format(num_vars)) if not max_num or max_num < 1: if max_num == 0: max_num = 0.5 max_num = int(np.power(df.shape[0], max_num)) print('Max allowed vars: {0}'.format(max_num)) if num_vars > max_num: if df.isnull().any().any(): imputed_df, field_names = impute_if_any_nulls( df.loc[:, field_names].astype(float)) for n in field_names: df[n] = imputed_df[n] # Creates Correlation Matrix corr_matrix = df.loc[:, field_names].corr() max_corr = [(fld, corr_matrix.iloc[i + 1, :i].max()) for i, fld in reverse_enumerate(field_names[1:])] max_corr.sort(key=lambda tup: tup[1]) return_x_vals = [fld for fld, corr in max_corr[:max_num]] print('Number of Remaining Fields: {0}'.format(len(return_x_vals))) print('Remaining Fields: {0}'.format(return_x_vals)) return df, return_x_vals return df, field_names
def impute_if_any_nulls(impute_df: pd.DataFrame, verbose: bool = False): from fancyimpute import BiScaler, NuclearNormMinimization, MatrixFactorization, IterativeSVD impute_names = impute_df.columns.values.tolist() impute_index = impute_df.index.values for imputer in [ BiScaler, NuclearNormMinimization, MatrixFactorization, IterativeSVD ]: if impute_df.isnull().any().any(): print( f'Imputation: Null values are in the DF. Running imputation using "{imputer.__name__}"' ) impute_df = imputer(verbose=verbose).fit_transform( impute_df.values) impute_df = pd.DataFrame(data=impute_df, columns=impute_names, index=impute_index) else: break # else: # print('Imputation: Unable to eliminate all NULL values from the dataframe! FIX THIS!') for n in impute_names.copy(): if impute_df[n].isnull().any().any(): print('Field [{0}] was still empty after imputation! Removing it!'. format(n)) impute_names.remove(n) return impute_df, impute_names
def remove_nans(
    counterfactuals: pd.DataFrame, factuals: pd.DataFrame = None
) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
    """Remove instances for which a counterfactual could not be found.

    Parameters
    ----------
    counterfactuals:
        Has to be the same shape as factuals.
    factuals:
        Has to be the same shape as counterfactuals. (optional)

    Returns
    -------
    The counterfactuals with NaN rows removed, and, if `factuals` is given,
    the factuals with the same rows removed.
    """
    # get indices of unsuccessful counterfactuals
    nan_idx = counterfactuals.index[counterfactuals.isnull().any(axis=1)]
    output_counterfactuals = counterfactuals.copy()
    output_counterfactuals = output_counterfactuals.drop(index=nan_idx)

    if factuals is not None:
        if factuals.shape[0] != counterfactuals.shape[0]:
            raise ValueError(
                "Counterfactuals and factuals should contain the same amount of samples"
            )
        output_factuals = factuals.copy()
        output_factuals = output_factuals.drop(index=nan_idx)
        return output_counterfactuals, output_factuals

    return output_counterfactuals
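# A short usage sketch for remove_nans; the toy frames are illustrative
# assumptions, not from the original source.
import numpy as np
import pandas as pd

cfs = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
facts = pd.DataFrame({'x': [0.5, 0.6, 0.7]})
clean_cfs, clean_facts = remove_nans(cfs, facts)
print(len(clean_cfs), len(clean_facts))  # 2 2 -- row 1 dropped from both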
def _check_Xy(X: pd.DataFrame, y: pd.Series, *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)
    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)
    valid = ~X.isnull().any(axis=1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]
    if is_object_dtype(y):
        y = pd.Categorical(y)
    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()
    return X, y
def remove_nan(df: pd.DataFrame) -> pd.DataFrame:
    if df.isnull().values.any():
        print('Data not OK, removing nan values..')
        print()
        # count NaNs per feature, by position
        nan_values = [df.iloc[:, j].isnull().sum() for j in range(df.shape[1])]
        indices = list(np.arange(df.shape[1]))
        print('Before:')
        print(f"Indices: {indices}")  # index of each feature
        print(f"NaN values: {nan_values}")  # NaN count per feature
        print()
        df = df.fillna(df.median())  # replacing nan with median
        nan_values = [df.iloc[:, j].isnull().sum() for j in range(df.shape[1])]
        print('After:')
        print(f"Indices: {indices}")
        print(f"NaN values: {nan_values}")
        print()
    else:
        print("Data has no NaN values")
    return df
def check_if_valid_data(df: pd.DataFrame) -> bool:
    # Check if dataframe is empty
    if df.empty:
        print("No songs downloaded. Finishing execution")
        return False

    # Primary Key Check
    if not df['played_at'].is_unique:
        raise Exception("Primary Key check is violated")

    # Check for nulls
    if df.isnull().values.any():
        raise Exception("Null values found")

    # Check that all timestamps are of yesterday's date
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday = yesterday.replace(hour=0, minute=0, second=0, microsecond=0)
    timestamps = df["timestamp"].tolist()
    for timestamp in timestamps:
        if datetime.datetime.strptime(timestamp, '%Y-%m-%d') != yesterday:
            raise Exception(
                "At least one of the returned songs does not have a yesterday's timestamp"
            )
    return True
def check_data(df: pd.DataFrame) -> bool:
    # collect every failed check, print them all, then report the result
    list_error = []

    # is it empty ?
    if df.empty:
        list_error.append("Nothing downloaded. Perhaps you listened to no "
                          "songs that day. Execution Finished")

    # are there duplicates ?
    if not df["played_at"].is_unique:
        list_error.append(
            "Primary Key is violated, hence there are duplicates in data")

    # are there null values ?
    if df.isnull().values.any():
        list_error.append("Null values found")

    # is it the last 24 hours ?
    '''yesterday_ = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday_ = yesterday_.replace(hour=0, minute=0, second=0, microsecond=0)
    today_ = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    yesterday_list = [yesterday_, today_]
    timestamps = df["timestamp"].to_list()
    for timestamp in timestamps:
        if datetime.datetime.strptime(timestamp, "%Y-%m-%d") not in yesterday_list:
            print(yesterday_)
            list_error.append("Something wrong with timestamps")
            raise Exception("Something wrong with timestamps")'''

    if list_error:
        for error in list_error:
            print(error)
        return False
    return True
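# A minimal usage sketch for check_data; the toy frames are illustrative
# assumptions, not from the original source.
import pandas as pd

songs = pd.DataFrame({'played_at': ['t1', 't2'], 'track': ['a', 'b']})
print(check_data(songs))   # True: non-empty, unique keys, no nulls

dupes = pd.DataFrame({'played_at': ['t1', 't1'], 'track': ['a', 'a']})
print(check_data(dupes))   # False, after printing the primary-key error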
def _check_metadata(self, df: pd.DataFrame):
    """Check metadata LOCATIONS tab has valid format."""
    # Check columns
    cols = ["location", "source_name", "automated", "include"]
    cols_missing = [col for col in cols if col not in df.columns]
    cols_wrong = [col for col in df.columns if col not in cols]
    if cols_missing:
        raise ValueError(f"LOCATIONS missing column(s): {cols_missing}.")
    if cols_wrong:
        raise ValueError(f"LOCATIONS has invalid column(s): {cols_wrong}.")
    # Check duplicated rows
    location_counts = df.location.value_counts()
    if (location_counts > 1).any():
        locations_dup = location_counts[location_counts > 1].index.tolist()
        raise ValueError(
            f"Duplicated location(s) found in LOCATIONS. Check {locations_dup}"
        )
    # any() twice reduces over both axes of the null mask
    if df.isnull().any().any():
        raise ValueError(
            "Check LOCATIONS. Some fields missing (empty / NaNs)")
    # Ensure booleanity of columns automated, include
    if not df.automated.isin([True, False]).all():
        vals = df.automated.unique()
        raise ValueError(
            f"LOCATIONS column `automated` should only contain TRUE/FALSE. Check {vals}"
        )
    if not df.include.isin([True, False]).all():
        vals = df.include.unique()
        raise ValueError(
            f"LOCATIONS column `include` should only contain TRUE/FALSE. Check {vals}"
        )
def _fully_connect_outflows(self, outflows_data: pd.DataFrame) -> pd.DataFrame: """Helper function for get_data_inputs that ensures outflows_data is fully connected.""" # Handle sparse outflow events where a disaggregation is missing data for some time steps fully_connected_columns = self.data_dict["disaggregation_axes"] + [ "compartment", "outflow_to", "time_step", ] outflows_data = (outflows_data.groupby(fully_connected_columns) ["total_population"].sum().unstack( level=["time_step"])) # Raise a warning if there are any disaggregations without outflow records for more than 25% of the time steps missing_event_threshold = 0.25 number_of_missing_events = outflows_data.isnull().sum(axis=1) sparse_disaggregations = number_of_missing_events[ number_of_missing_events / len(outflows_data.columns) > missing_event_threshold] if not sparse_disaggregations.empty: warn( f"Outflows data is missing for more than {missing_event_threshold * 100}% for some disaggregations:\n" f"{100 * sparse_disaggregations / len(outflows_data.columns)}") # Fill the total population with 0 and remove the multiindex for the population simulation return (outflows_data.fillna(0).stack("time_step").reset_index( name="total_population"))
def get_data_metadata(X: DataFrame, y: Series) -> dict: X_raw = convert_to_raw(X) feature_metadata_orig = FeatureMetadata.from_df(X) feature_metadata_raw = FeatureMetadata.from_df(X_raw) num_rows, num_cols = X.shape num_null = X.isnull().sum().sum() try: problem_type = infer_problem_type(y, silent=True) except: # TODO: Remove, only here for legacy compatibility problem_type = infer_problem_type(y) if problem_type in ['binary', 'multiclass']: num_classes = len(y.unique()) else: num_classes = None data_metadata = { 'num_rows': num_rows, 'num_cols': num_cols, 'num_null': num_null, 'num_classes': num_classes, 'problem_type': problem_type, 'feature_metadata': feature_metadata_orig, 'feature_metadata_raw': feature_metadata_raw, } # TODO: class imbalance # TODO: has_text # TODO: has_special # TODO: memory size return data_metadata
def __init__(self, evidence_dataframe: pd.DataFrame): """ L(data|xi, signal *or* noise) = L(data|signal)*xi +(1-xi)*L(data|noise) where xi --> p(signal) Parameters ---------- evidence_dataframe: pandas dataframe """ bilby.Likelihood.__init__( self, parameters={ DUTY_CYCLE: None, GLITCH_H1_DUTY_CYCLE: None, GLITCH_L1_DUTY_CYCLE: None, }, ) nan_present = evidence_dataframe.isnull().values.any() assert not nan_present, "NaN present in the evidence dataframe!" self.log_evidence = evidence_dataframe[rkeys.LOG_EVIDENCE].values self.log_noise_evidence = evidence_dataframe[ rkeys.LOG_NOISE_EVIDENCE].values self.log_glitch_H_evidence = evidence_dataframe[ rkeys.LOG_GLITCH_H_EVIDENCE].values self.log_glitch_L_evidence = evidence_dataframe[ rkeys.LOG_GLITCH_L_EVIDENCE].values
def add_no_response_ratio(df_clean_log: pd.DataFrame,
                          df_whole_v4_dirty: pd.DataFrame):
    """
    Compute new features: no_response_ratio and invalid_ratio.
    For each interview, calculate the proportion of no-response and invalid
    answers.
    """
    df_clean_log = df_clean_log.drop_duplicates()
    df_invalid = df_clean_log[['question', "uuid"]].groupby("uuid").count()
    # remove all dummy columns
    for col in df_whole_v4_dirty.columns:
        if '/' in col:
            df_whole_v4_dirty = df_whole_v4_dirty.drop(col, axis=1)
    df_whole_v4_dirty['no_response_ratio'] = df_whole_v4_dirty.isnull().sum(
        axis=1)
    df_whole_v4_dirty['invalid_ratio'] = 0
    _, n_col = df_whole_v4_dirty.shape
    df_whole_v4_dirty = df_whole_v4_dirty.apply(cal_no_r_count,
                                                axis=1,
                                                df_invalid=df_invalid)
    df_whole_v4_dirty[
        'no_response_ratio'] = df_whole_v4_dirty['no_response_ratio'] / n_col
    df_whole_v4_dirty[
        'invalid_ratio'] = df_whole_v4_dirty['invalid_ratio'] / n_col
    return df_whole_v4_dirty['no_response_ratio'], df_whole_v4_dirty[
        'invalid_ratio']
def df_missing_vals(df: pd.DataFrame) -> pd.Series: """ return Series containing the number of NaNs for each column that contains at least one """ null_counts = df.isnull().sum() return null_counts[null_counts > 0]
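# A quick usage sketch for df_missing_vals; the toy frame is an illustrative
# assumption, not from the original source.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan], 'b': [1.0, 2.0]})
print(df_missing_vals(toy))  # only column 'a' appears, with a count of 1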
def input_design(self, value: pd.DataFrame):
    if not value.equals(self.input_design):
        # if the dataframe is different from the one currently displayed,
        # warn about layout changes and reset the case number and status cols
        self.layoutAboutToBeChanged.emit()
        self._input_design = value
        self.dataChanged.emit(
            self.index(0, 0),
            self.index(self.rowCount(), self.columnCount()))
        self.is_input_design_generated = not value.isnull().all(axis=None)
        self.input_design_changed.emit(self.is_input_design_generated)
        self.layoutChanged.emit()

        # number of experiments
        n_samp = self.app_data.doe_lhs_settings['n_samples']
        if self.app_data.doe_lhs_settings['inc_vertices']:
            n_samp += 2**len(self._input_alias)  # include vertices

        # create empty dataframes with NaN values
        self._case_num = pd.DataFrame({'case': np.arange(1, n_samp + 1)})
        self._status_sim = pd.DataFrame({'status': [''] * n_samp},
                                        dtype=object)

        # reset sampled data as well
        self.samp_data = pd.DataFrame(np.nan,
                                      index=range(n_samp),
                                      columns=self._output_alias,
                                      dtype=float)
type(np.nan)

## create a sample data set
zip1 = zip([2, 4, 8], [np.nan, 5, 7], [np.nan, np.nan, 22])
df1 = DataFrame(zip1, columns=['a', 'b', 'c'])
df1

## finding missing values with the pandas DataFrame method `isnull` and numpy's `isnan`
## both return boolean values: True/False
# search a whole dataframe
df1.isnull()
np.isnan(df1)

# search specific columns
cols = ['a', 'c']  # create a list of column keys
df1[cols]
df1[cols].isnull()

# also works on a series
df1['b']
df1['b'].isnull()

# pandas also has a negation of `isnull`, `notnull`
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull()  # all False! perfectly opposite
Handling missing values in a DataFrame
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#%%
import numpy as np
from pandas import DataFrame
from string import ascii_letters as letters  # `string.letters` was Python 2 only

d = DataFrame(np.arange(100.0).reshape(10, 10), columns=list(letters[:10]))
#%%
d[d % 13 == 0] = np.nan
d[d % 17 == 0] = np.nan
d[(d >= 80) & (d < 90)] = np.nan
#%%
d
#%% drop every row that contains any missing value
d.dropna()
#%% drop only the rows that are entirely missing
d.dropna(how='all')
#%% drop a row as soon as any one column is missing
d.dropna(how='any')
#%% require at least 9 non-missing columns per row
d.dropna(thresh=9)
#%% drop by column instead
d.dropna(thresh=8, axis=1)
#%% the DataFrame isnull method
d.isnull()
#%% filling missing values
d.fillna(0)
#%%
d.fillna(dict(zip(letters[:10], range(-1, -11, -1))))
# Note: dropna operates on rows by default, while fillna works per column
#%%
d.fillna(method='ffill', limit=1)
class DataWorker(object):
    @staticmethod
    def feat_value2int(series):
        all_values = list(enumerate(np.unique(series)))
        value_dict = {name: i for i, name in all_values}
        return value_dict

    def __init__(self, data=None):
        """
        Init DataWorker with a pandas.DataFrame.
        Otherwise make sure that the raw data can be transformed to a DataFrame.
        """
        if data is None:
            self.__data = DataFrame()
        elif isinstance(data, DataFrame):
            self.__data = data.copy()
        else:
            self.__data = DataFrame(data)
        self.__featureDict = None

    @property
    def featureDict(self):
        return self.__data.select_dtypes(include=['object'])

    @featureDict.setter
    def featureDict(self, value):
        pass

    @property
    def data(self):
        return self.__data

    @data.setter
    def data(self, df):
        # assign the mangled attribute; `self.data = df` would recurse
        self.__data = df

    def getColNamesWithNan(self):
        s = self.__data.isnull().any()
        return s.index[s].tolist()

    def dataClean(self, transDict=None, fillna={'all': 'most_frequent'}, yCol=-1):
        """
        yCol: the column you want to predict
        fillna: a {column: method_name} dictionary,
            default: {'all': 'most_frequent'}
            provided functions are: 'most_frequent', 'mean', 'median',
            'first_n_frequent,n' (where the trailing n is a number)
            when key == 'all': fill every column that includes NaN with the
            same function; this key should be put at the end
        """
        # try to map all data to numeric
        self.__data = cd.fillna(self.__data, fillna)
        if transDict is None:
            self.__featureDict
        if yCol != -1:
            self.__data = cd.change_yCol(self.__data, yCol)

    def algorithmUsing(self):
        pass

    def showFeagure(self):
        pass

    def getResult(self):
        pass
df[0:1]                                                     # to index a row use ranges
df.iloc[0]                                                  # .iloc[] indexes a row by position (returned as a Series)
df.loc[df.index[1:2], ['col1']] = 666                       # to index or modify a specific cell (use .loc/.iloc; the old .ix accessor was removed)
df3 = df.reindex(index=['A','B','C','D','E','F'], columns=new_columns) # .reindex() can add rows and columns simultaneously
df2 = df.reindex(['A','B','C','D','E','F'])                 # .reindex() to add new rows (in this case 'C')
df2 = df.reindex(columns=new_columns)                       # .reindex() can also add new columns
df3['newCol'] = a_list                                      # you can use lists to fill columns
df4 = df3.drop('newCol', axis=1)                            # drop a column
df5 = df4.rename(columns={'col1':'test1'})                  # rename a column
df6 = df5.rename(index={'A':'Alpha'})                       # rename an index
dflist = df['col2'].tolist()                                # convert a column to a list
df6.index = df6.index.map(str.lower)                        # bulk convert all indexes to lower case
df7 = df6.rename(index=str.title, columns=str.title)        # bulk convert indexes and columns to capital first letter
df > 0                                                      # conditional operators create new, boolean dataframes
df.isnull()                                                 # check entire dataframe for NaNs

#############################################################################################################
# 3. Sorting Dataframes
#############################################################################################################
df.sort_index()                                             # sort ascending by index (pass ascending=False for descending)
df.sort_values(by=['Col1','Col2'], ascending=True, inplace=True) # sort by specified columns (.sort() was removed)

#############################################################################################################
# 4. Missing Values
#############################################################################################################
df5 = DataFrame([[1,2,3,np.nan],[np.nan,5,6,7],[7,np.nan,9,np.nan],[np.nan,np.nan,np.nan,np.nan]])
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('*****@*****.**')
m.groupdict()

### vectorized string functions in pandas
data = {'Dave': '*****@*****.**', 'Steve': '*****@*****.**',
        'Rob': '*****@*****.**', 'Wes': np.nan}
data = Series(data)
data
data.isnull()
data.str.contains('gmail')
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
matches.str.get(1)
matches.str[0]
data.str[:5]
def create_log_features2(ids, feature_df):
    """Creates log features2

    Parameters
    ----------
    ids : data frame with id column
    feature_df : data frame with columns
        id - id
        log_feature - name of feature in the form "log_feature <number>",
            e.g. "log_feature 56"
        volume - volume

    Return
    ------
    df : data frame with columns
        id - id
        max_log_feature - maximum feature number
        min_log_feature - minimum feature number
        median_log_feature - median feature number
        count_log_feature - number of different features
        max_volume - maximum volume
        min_volume - minimum volume
        median_volume - median volume
        count_volume - the same as count_log_feature, consider dropping!
    """
    print("CREATING LOG FEATURES")
    start_time = time.time()
    all_df = DataFrame(ids, columns=['id'])
    feature_df['log_feature'] = feature_df['log_feature'].apply(
        lambda val: int(val.split()[1]))
    names = ('log_feature', 'volume')
    for name in names:
        gdf = feature_df[['id', name]].groupby('id', as_index=False)

        max_feature = gdf.max()
        max_feature.columns = ['id', 'max_' + name]
        all_df = pd.merge(all_df, max_feature, how='inner', on='id')

        min_feature = gdf.min()
        min_feature.columns = ['id', 'min_' + name]
        all_df = pd.merge(all_df, min_feature, how='inner', on='id')

        median_feature = gdf.median()
        median_feature.columns = ['id', 'median_' + name]
        all_df = pd.merge(all_df, median_feature, how='inner', on='id')

        count_feature = gdf.count()
        count_feature.columns = ['id', 'count_' + name]
        all_df = pd.merge(all_df, count_feature, how='inner', on='id')

    # check whether there are null entries
    if all_df.isnull().values.sum() > 0:
        print("ERROR: there are null entries in the all df data frame")

    elapsed_time = time.time() - start_time
    print("ALL FEATURES WERE SUCCESSFULLY CREATED. TIME ELAPSED " +
          str(elapsed_time) + "sec.")
    return all_df
# QUICK TIP: you can repeat lists by multiplying! [1,2,3] [1,2,3]*3 # types missing data None np.nan type(None) type(np.nan) ## create a sample data set zip1 = zip([2,4,8], [np.nan, 5, 7], [np.nan, np.nan, 22]) df1 = DataFrame(zip1, columns = ['a', 'b', 'c']) ## search for missing data using df1.isnull() # pandas method to find missing data np.isnan(df1) # numpy way ## subset of columns cols = ['a', 'c'] df1[cols] df1[cols].isnull() ## for series df1['b'].isnull() ## find non-missing values df1.isnull() df1.notnull() df1.isnull() == df1.notnull()
# b) To enrich our data, concatenate to our DataFrame a DataFrame
#    with the following information:
new_data = {'equipo': ['Atletico de Madrid'], 'titulos': [29], 'socios': [48008]}
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
equipos = pd.concat([equipos, pd.DataFrame(new_data)], ignore_index=True)

# c) Create a new column 'posicion' with the following data:
posicion_values = ['13', np.nan, '3', np.nan, '5', np.nan]
equipos['posicion'] = posicion_values

# d) Show the position of the elements that are NA in our DataFrame.
# This shows the complete rows
equipos[equipos.isnull().any(axis=1)]
# This shows only the positions, understood as the indexes
equipos[equipos.isnull().any(axis=1)].index

# e) Show our DataFrame without the rows containing NA elements.
equipos.dropna()

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()