def drop_reqpeat01():
    """Demo of DataFrame.duplicated / drop_duplicates on a small frame.

    Prints the frame, its duplicate mask, and several deduplicated views.
    Returns None; all output goes to stdout.
    """
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    # print statements converted to the print() function (Python 3).
    print(data)
    print(data.duplicated())
    print(data.drop_duplicates())
    # Add a distinct column so full-row dedup no longer collapses rows.
    data['v1'] = range(7)
    print(data.drop_duplicates(['k1']))
    print(data)
    # keep='last' retains the final occurrence of each (k1, k2) pair.
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))
def test_frame_datetime64_duplicated():
    """duplicated() must not report false duplicates on datetime64 columns."""
    dates = date_range('2010-07-01', end='2010-08-05')
    tst = DataFrame({'symbol': 'AAA', 'date': dates})
    result = tst.duplicated(['date', 'symbol'])
    # Bug fix: use bitwise NOT (~). Unary minus on a boolean Series is not
    # supported by modern pandas/NumPy and raises TypeError.
    assert (~result).all()
    tst = DataFrame({'date': dates})
    result = tst.duplicated()
    assert (~result).all()
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730: a bad column name in `subset` must raise KeyError from
    # both duplicated() and drop_duplicates().
    df = DataFrame({'A': [0, 0, 1], 'B': [0, 0, 1], 'C': [0, 0, 1]})
    for method in (df.duplicated, df.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)
def slide_21():
    """Explore the USDA food JSON (pydata-book ch07 demo).

    Relies on module-level FOODJSONPATH, pd (pandas) and plt (matplotlib).
    Prints intermediate frames and shows a bar plot; returns None.
    """
    import json
    db = json.load(open(FOODJSONPATH))
    # print statements converted to the print() function (Python 3).
    print(len(db))
    print(db[0].keys())
    print(db[0]['nutrients'][0])
    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])
    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])
    print(pd.value_counts(info.group)[:10])
    print("今から全部のnutrientsを扱うよ")
    # Build one long nutrients table, tagging each record with its food id.
    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)
    print("なんか重複多い")
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()
    print("infoとnutrients両方にdescriptionとgroupがあるから変えよう")
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)
    ndata = pd.merge(nutrients, info, on='id', how='outer')
    # .ix was removed from pandas; .loc is the label-based equivalent here.
    print(ndata.loc[30000])
    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    # Series.order() was removed from pandas; sort_values() replaces it.
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    plt.show()
    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print(max_foods.loc['Amino Acids']['food'])
def slide_10():
    """Demo of duplicated() and the drop_duplicates() variants.

    Prints each intermediate view; returns None.
    """
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    # print statements converted to the print() function (Python 3).
    print(data)
    print(data.duplicated())
    print(data.duplicated('k1'))
    print(data.drop_duplicates())
    data['v1'] = range(7)
    print(data)
    print(data.drop_duplicates(['k1']))
    # take_last=True was removed from pandas; keep='last' is the replacement.
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))
class Duplicated(object):
    """ASV-style benchmark for DataFrame.duplicated()."""

    def setup(self):
        # One million rows drawing from small pools of timestamps / rounded
        # floats, so duplicated() has genuine repeats to find.
        n = 1 << 20
        stamps = date_range('2015-01-01', freq='S', periods=n // 64)
        floats = np.random.randn(n // 64).round(2)
        self.df = DataFrame({
            'a': np.random.randint(-1 << 8, 1 << 8, n),
            'b': np.random.choice(stamps, n),
            'c': np.random.choice(floats, n),
        })
        # A short-but-very-wide string frame for the "wide" benchmark.
        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T

    def time_frame_duplicated(self):
        self.df.duplicated()

    def time_frame_duplicated_wide(self):
        self.df2.duplicated()
def submit(self, df: pd.DataFrame, job_opts: JobOpts, deplay=0.02, progressbar=True):
    """Submit jobs to the cluster.

    You have to establish a connection first (explicit is better than implicit).

    Args:
        df: Table of jobs to run; must contain a ``system_command`` column
            and no duplicated rows (both are asserted below).
        job_opts: Per-batch options; ``working_dir`` / ``job_id`` are used to
            create the job's working directory.
        deplay: Seconds to sleep between submissions (name is a typo for
            "delay" but kept for backward compatibility with callers).
        progressbar: Forwarded to ``self._itertuples``.

    Returns:
        List of ``concurrent.futures.Future`` objects, one per row of `df`.

    Examples:
        >>> with js.connect():
        ...     js.submit([(0, 'echo "Hello world!"'), (1, 'echo "Goodbye world!"')])

    NOTE(review): the example passes a list of tuples while the asserts
    below require a DataFrame with a 'system_command' column — confirm
    which contract is current.
    """
    assert 'system_command' in df
    assert not df.duplicated().any()
    job_opts.working_dir.joinpath(job_opts.job_id).mkdir(parents=True, exist_ok=True)
    # 'local' scheme runs on this host; anything else goes to the remote worker.
    if self.host_opts.scheme in ['local']:
        worker = functools.partial(self._local_worker, job_opts=job_opts)
    else:
        worker = functools.partial(self._remote_worker, job_opts=job_opts)
    # Submit multiple jobs in parallel
    futures = []
    pool = concurrent.futures.ThreadPoolExecutor()
    for row in self._itertuples(df, progressbar=progressbar):
        future = pool.submit(worker, row)
        futures.append(future)
        # Throttle submissions to avoid hammering the scheduler.
        time.sleep(deplay)
    # Don't block here; callers wait on the returned futures instead.
    pool.shutdown(wait=False)
    return futures
def process_duplicated_entries(dfm_stk_strc: DataFrame, stockid):
    """Merge rows sharing a 变动日期 (change date) into the first such row.

    The 变动原因 (change reason) of each duplicate row is appended,
    '|'-separated, to the kept row's reason; duplicates are then dropped
    in place.

    Args:
        dfm_stk_strc: Stock-structure frame, mutated in place.
        stockid: Stock identifier, used only for logging.
    """
    dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])]
    for index, row in dfm_duplicated.iterrows():
        # Bug fix: merge into the *kept* (first) row for this date. The
        # original indexed with the duplicate's own label after
        # drop_duplicates(inplace=True) had removed it, raising KeyError
        # whenever duplicates actually existed.
        main_idx = dfm_stk_strc[dfm_stk_strc['变动日期'] == row['变动日期']].index[0]
        dfm_stk_strc.loc[main_idx, '变动原因'] = (
            dfm_stk_strc.loc[main_idx]['变动原因'] + '|' + row['变动原因'])
        logprint('Stock %s 变动日期 %s 记录合并到主记录中. %s'
                 % (stockid, row['变动日期'], tuple(row)))
    # Drop the now-merged duplicate rows, keeping the enriched first rows.
    dfm_stk_strc.drop_duplicates('变动日期', inplace=True)
def test_duplicated_on_empty_frame():
    # GH 25184: duplicated() on a zero-row frame must be an all-False mask,
    # so filtering by it returns the frame unchanged.
    df = DataFrame(columns=['a', 'b'])
    mask = df.duplicated('a')
    tm.assert_frame_equal(df[mask], df.copy())
def deal_string02():
    """Explore the USDA food JSON (pydata-book ch07 exercise).

    Relies on module-level pd (pandas); reads a hard-coded Windows path.
    Prints intermediate frames and plots median zinc values; returns None.
    """
    import json
    db = json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    # print statements converted to the print() function (Python 3).
    print(len(db))
    print(db[0])
    print(db[0].keys())
    print(db[0]['nutrients'][0])
    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])
    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])
    print(pd.value_counts(info.group)[:10])
    # Build one long nutrients table, tagging each record with its food id.
    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)
    print(nutrients)
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    print(info)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)
    print(nutrients)
    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata)
    # .ix was removed from pandas; .loc is the label-based equivalent here.
    print(ndata.loc[3000])
    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    # Bug fix: Series has no .idmin(); idxmin() is the intended method.
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print(max_foods.loc['Amino Acids']['food'])
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524: a frame that becomes ~30k columns wide after transposing
    # must not blow up inside duplicated(). Only the result type matters;
    # the actual values are usually all False here.
    data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000)
            for i in range(100)}
    wide = DataFrame(data).T
    outcome = wide.duplicated()
    assert isinstance(outcome, Series)
    assert outcome.dtype == np.bool_
def jhu_world_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the JHU world time-series into one row per country.

    Drops the region column, renames to our schema, removes known
    double-listed countries, maps JHU country labels to ours, and sums the
    date columns of countries that appear once per region.
    """
    # We are just gonna do per country data in this CSV file
    df = df.drop(columns=['Province/State'])
    # Change column names to the ones we use
    df = df.rename(columns={
        'Country/Region': 'country',
        'Lat': 'latitude',
        'Long': 'longitude'
    })
    # Remove duplicate countries
    dup_countries = {'Bahamas, The', 'Congo (Brazzaville)'}
    df = df[~df.country.isin(dup_countries)]
    # Map JHU labels onto the country names we use elsewhere
    df.country = df.country.replace({
        'Korea, South': 'South Korea',
        'US': 'United States',
        'The Bahamas': 'Bahamas',
        'Congo (Kinshasa)': 'Democratic Republic of the Congo',
        'Czechia': 'Czech Republic',
        'Taiwan*': 'Taiwan',
        'Cruise Ship': 'Diamond Princess',
        'Cote d\'Ivoire': 'Ivory Coast'
    })
    # JHU data has a PK of (region, country) so we need to sum up the rows
    # that are dates for each one.
    # Bug fix: keep=False so the FIRST row of each duplicated country is
    # included in the sum — the original omitted it, silently losing that
    # region's counts.
    dup = df[df.duplicated(['country'], keep=False)]
    dup = dup.groupby('country').sum()
    dup = dup.reset_index()
    dup.latitude = 0
    dup.longitude = 0
    # Unique countries
    df = df[~df.duplicated(['country'], keep=False)]
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([df, dup])
    return df
def handle_data(data, labels):
    """Split raw rows into test/train frames, then drop duplicates and NaNs.

    Every DIVISION_SCOPE-th row goes to the test set; the rest go to train.
    Returns (test_df, train_df) after cleaning, with progress printed.
    """
    print("All data sets size: " + str(len(data)))
    test_data = [row for idx, row in enumerate(data) if idx % DIVISION_SCOPE == 0]
    train_data = [row for idx, row in enumerate(data) if idx % DIVISION_SCOPE != 0]
    print("Train Sets size: " + str(len(train_data)))
    print("Test Sets size: " + str(len(test_data)))
    test_data_df = DataFrame(test_data, columns=labels)
    train_data_df = DataFrame(train_data, columns=labels)
    # Clean Data Frame
    train_dup_mask = train_data_df.duplicated()
    test_dup_mask = test_data_df.duplicated()
    train_data_df = train_data_df.drop_duplicates()
    test_data_df = test_data_df.drop_duplicates()
    # len(mask) is the pre-drop row count, so the difference is the number dropped.
    print("The Duplicated items in Train Sets size: " + str(len(train_dup_mask) - len(train_data_df)))
    print("The Duplicated items in Test Sets size: " + str(len(test_dup_mask) - len(test_data_df)))
    train_null_mask = train_data_df.isnull()
    test_null_mask = test_data_df.isnull()
    train_data_df = train_data_df.dropna()
    test_data_df = test_data_df.dropna()
    print("The Null items in Train Sets size: " + str(len(train_null_mask) - len(train_data_df)))
    print("The Null items in Test Sets size: " + str(len(test_null_mask) - len(test_data_df)))
    print("Clean Done. The Train Data Sets size: " + str(len(train_data_df)))
    print("Clean Done. The Test Data Sets size: " + str(len(test_data_df)))
    return test_data_df, train_data_df
def describe_table(df: pd.DataFrame, variable_stats: pd.DataFrame) -> dict:
    """General statistics for the DataFrame.

    Args:
        df: The DataFrame to describe.
        variable_stats: Previously calculated statistic on the DataFrame.

    Returns:
        A dictionary that contains the table statistics.
    """
    n = len(df)
    memory_size = df.memory_usage(index=True).sum()
    # NOTE(review): divides by n — ZeroDivisionError for an empty frame;
    # confirm callers never pass one.
    record_size = float(memory_size) / n
    table_stats = {
        "n": n,
        "nvar": len(df.columns),
        "memsize": memory_size,
        "recordsize": record_size,
        # Missing-value totals come from the precomputed per-variable stats.
        "n_cells_missing": variable_stats.loc["n_missing"].sum(),
        "n_vars_with_missing": sum((variable_stats.loc["n_missing"] > 0).astype(int)),
        "n_vars_all_missing": sum((variable_stats.loc["n_missing"] == n).astype(int)),
    }
    table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / (
        table_stats["n"] * table_stats["nvar"])
    # Duplicates are only counted over columns whose type is supported.
    supported_columns = variable_stats.transpose()[variable_stats.transpose(
    ).type != Variable.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats["n_duplicates"] = (sum(df.duplicated(
        subset=supported_columns)) if len(supported_columns) > 0 else 0)
    table_stats["p_duplicates"] = ((table_stats["n_duplicates"] / len(df)) if
                                   (len(supported_columns) > 0 and len(df) > 0) else 0)
    # Variable type counts: zero-initialise every known type, then overlay
    # the observed counts.
    table_stats.update({k.value: 0 for k in Variable})
    table_stats.update(
        dict(variable_stats.loc["type"].apply(
            lambda x: x.value).value_counts()))
    # "Rejected" aggregates the constant, correlated and recoded variables.
    table_stats[Variable.S_TYPE_REJECTED.value] = (
        table_stats[Variable.S_TYPE_CONST.value] +
        table_stats[Variable.S_TYPE_CORR.value] +
        table_stats[Variable.S_TYPE_RECODED.value])
    return table_stats
def proportionBySport(df: pd.DataFrame, yr: int, sport: str, gdr: str) -> float:
    """
    The function answers questions like the following : “What was the
    percentage of female basketball players among all the female
    participants of the 2016 Olympics?”

    Returns:
        float: Percentage of participants who played the given sport among
        the participants of the given gender.
    """
    participants = df[(df["Year"] == yr) & (df["Sex"] == gdr)]
    # Count each athlete once, regardless of how many events they entered.
    participants = participants.drop_duplicates(subset=["ID"])
    players = participants[participants["Sport"] == sport]
    return players.shape[0] / participants.shape[0]
def validate(df: pd.DataFrame):
    '''Checks if the scraped data is valid and as expected.'''
    log("Running validity check")
    # End the program if no data was scraped
    if df.empty:
        log("Validity check failure: no data, program ended after " + str(pd.Timestamp.now() - start))
        raise Exception("Validity check failure: empty dataframe")
    # Check for empty cells in the table
    if df.isnull().values.any():
        log("Validity check: empty detected")
    # Check for duplicate lines in the table
    if df.duplicated().any():
        log("Validity check: duplicates detected")
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()
    # Then duplicates produce the bool Series as a result and don't fail during
    # calculation. Actual values doesn't matter here, though usually it's all
    # False in this case
    assert isinstance(result, Series)
    # Bug fix: the np.bool alias was removed in NumPy 1.24; np.bool_ is the
    # boolean scalar type to compare dtypes against.
    assert result.dtype == np.bool_
def __add_duplicates_to_tree_and_remove_spurious_leaves(
    self,
    tree: nx.DiGraph,
    character_matrix: pd.DataFrame,
    node_name_generator: Generator[str, None, None],
) -> nx.DiGraph:
    """Append duplicates and prune spurious extant lineages from the tree.

    Places samples removed in removing duplicates in the tree as sisters
    to the corresponding cells that share the same mutations. If any
    extant nodes that are not in the original character matrix are
    present, they are removed and their lineages are pruned such that the
    remaining leaves match the set of samples in the character matrix.

    Args:
        tree: The tree after solving
        character_matrix: Character matrix
        node_name_generator: Yields fresh unique names for the internal
            nodes introduced per duplicate group.

    Returns:
        The tree with duplicates added and spurious leaves pruned
    """
    character_matrix.index.name = "index"
    # Map the first sample of each duplicate group to the tuple of ALL
    # samples sharing its exact character states (first included).
    duplicate_groups = (character_matrix[character_matrix.duplicated(
        keep=False) == True].reset_index().groupby(
            character_matrix.columns.tolist())["index"].agg(
                ["first", tuple]).set_index("first")["tuple"].to_dict())
    for i in duplicate_groups:
        # Turn the representative leaf into a new internal node and hang
        # every member of its group beneath it as a leaf.
        new_internal_node = next(node_name_generator)
        nx.relabel_nodes(tree, {i: new_internal_node}, copy=False)
        for duplicate in duplicate_groups[i]:
            tree.add_edge(new_internal_node, duplicate)
    # remove extant lineages that don't correspond to leaves
    to_drop = []
    leaves = [n for n in tree if tree.out_degree(n) == 0]
    for l in leaves:
        if l not in character_matrix.index:
            to_drop.append(l)
            # Walk up and also drop ancestors that would become unary
            # chains once this leaf is removed.
            parent = [p for p in tree.predecessors(l)][0]
            while tree.out_degree(parent) < 2:
                to_drop.append(parent)
                parent = [p for p in tree.predecessors(parent)][0]
    tree.remove_nodes_from(to_drop)
    return tree
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    We want to one-hot encode the categories ultimately having each as its own
    column and each row with a 0/1 if that example belongs to the category.
    We needed to merge initially to ensure categories matched with messages,
    but now we form a new categories df from the merged df categories column,
    splitting the categories on ';', creating column names, and then retaining
    only the binary 0/1 in as the row values. We finally drop the categories
    column in the original df and concat our new one-hot encoded categories
    df with it.
    :param df: the dataframe from load_data()
    :return: A new pandas dataframe with our categories one-hot encoded.
    """
    # We split the categories on the ; delimiter expanding into their own columns with expand=True
    categories = df.categories.str.split(';', expand=True)
    # Take the first row and remove the -N from each category to serve as column names
    row = categories.iloc[0, :]
    # use this row to extract a list of new column names for categories.
    category_column_names = row.apply(lambda x: x[:-2]).values
    categories.columns = category_column_names
    # For each column we want the row values to be just the binary numeric part 1/0
    for column in categories:
        # set each value to be the last character of the string
        categories[column] = categories[column].str[-1]
        # convert column from string to numeric
        categories[column] = categories[column].apply(int)
    # Now we drop the categories from the merged df and concat with the
    # new categories frame (note axis=1 meaning concat on columns)
    df.drop(columns=['categories'], inplace=True)
    df = pd.concat([df, categories], axis=1)
    # Remove duplicates
    df.drop_duplicates(inplace=True)
    # Check we now have no duplicates
    assert df[df.duplicated()].shape[0] == 0
    # Set 2 to 1 in the related column.
    # Bug fix: restrict the assignment to the 'related' column — the
    # original `df.loc[df.related == 2] = 1` overwrote EVERY column
    # (including id and message) of those rows with 1.
    df.loc[df.related == 2, 'related'] = 1
    # Also child_alone always 0 so useless
    df.drop(columns=['child_alone'], inplace=True)
    return df
def _general_dict(self, scenario): """ Generate the meta-information that holds for all runs (scenario info etc) Parameters ---------- scenario: smac.Scenario scenario file to get information from """ # general stores information that holds for all runs, runspec holds information on a run-basis general = OrderedDict() if len(self.runscontainer.get_budgets()) > 1: general['# budgets'] = len(self.runscontainer.get_budgets()) if len(self.runscontainer.get_folders()) > 1: general['# parallel runs'] = len(self.runscontainer.get_folders()) # Scenario related general['# parameters'] = len(scenario.cs.get_hyperparameters()) general['Deterministic target algorithm'] = scenario.deterministic general['Optimized run objective'] = scenario.run_obj if scenario.cutoff or scenario.run_obj == 'runtime': general['Cutoff'] = scenario.cutoff if any([str(lim)!='inf' for lim in [scenario.wallclock_limit, scenario.ta_run_limit, scenario.algo_runs_timelimit]]): general['Walltime budget'] = scenario.wallclock_limit general['Runcount budget'] = scenario.ta_run_limit general['CPU budget'] = scenario.algo_runs_timelimit # Instances num_train, num_test = [len([i for i in insts if i]) for insts in [scenario.train_insts, scenario.test_insts]] if num_train > 0 or num_test > 0: general['# instances (train/test)'] = "{} / {}".format(num_train, num_test) # Features num_feats = scenario.n_features if scenario.feature_dict else 0 num_dup_feats = 0 if scenario.feature_dict: dup_feats = DataFrame(scenario.feature_array) num_dup_feats = len(dup_feats[dup_feats.duplicated()]) # only contains train instances if num_feats > 0: general['# features (duplicates)'] = "{} ({})".format(num_feats, num_dup_feats) general['----------'] = '----------' combined_run = self.runscontainer.get_aggregated(False, False)[0] combined_stats = self._stats_for_run(combined_run.original_runhistory, combined_run.scenario, combined_run.incumbent) for k, v in combined_stats.items(): general[k] = v return general
def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict:
    """General statistics for the DataFrame.

    Args:
        df: The DataFrame to describe.
        variable_stats: Previously calculated statistic on the DataFrame.

    Returns:
        A dictionary that contains the table statistics.
    """
    n = len(df)
    memory_size = df.memory_usage(deep=config["memory_deep"].get(bool)).sum()
    # NOTE(review): divides by n — ZeroDivisionError for an empty frame;
    # confirm callers never pass one.
    record_size = float(memory_size) / n
    table_stats = {
        "n": n,
        "n_var": len(df.columns),
        "memory_size": memory_size,
        "record_size": record_size,
        "n_cells_missing": 0,
        "n_vars_with_missing": 0,
        "n_vars_all_missing": 0,
    }
    # Accumulate missing-value counts from the per-series summaries.
    for series_summary in variable_stats.values():
        if "n_missing" in series_summary and series_summary["n_missing"] > 0:
            table_stats["n_vars_with_missing"] += 1
            table_stats["n_cells_missing"] += series_summary["n_missing"]
            if series_summary["n_missing"] == n:
                table_stats["n_vars_all_missing"] += 1
    table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / (
        table_stats["n"] * table_stats["n_var"])
    # Duplicates are only counted over columns of a supported type.
    supported_columns = [
        k for k, v in variable_stats.items() if v["type"] != Unsupported
    ]
    table_stats["n_duplicates"] = (sum(df.duplicated(
        subset=supported_columns)) if len(supported_columns) > 0 else 0)
    table_stats["p_duplicates"] = ((table_stats["n_duplicates"] / len(df)) if
                                   (len(supported_columns) > 0 and len(df) > 0) else 0)
    # Variable type counts
    table_stats.update(
        {"types": dict(Counter([v["type"] for v in variable_stats.values()]))})
    return table_stats
def summarize_dataframe(self, source: pd.DataFrame, name: str,
                        target_dict: dict, skip: List[str]):
    """Fill `target_dict` (in place) with summary statistics for `source`.

    Args:
        source: Frame to summarize.
        name: Human-readable name stored under "name".
        target_dict: Mutated in place with the computed statistics.
        skip: Column names to exclude; their count is reported under
            "num_skipped_columns".
    """
    target_dict["name"] = name
    target_dict["num_rows"] = len(source)
    target_dict["num_columns"] = len(source.columns)
    # total columns minus the non-skipped ones = how many are in `skip`.
    target_dict["num_skipped_columns"] = len(source.columns) - len(
        [x for x in source.columns if x not in skip])
    target_dict["memory_total"] = source.memory_usage(index=True, deep=True).sum()
    # NOTE(review): divides by num_rows — ZeroDivisionError on an empty
    # frame; confirm callers never pass one.
    target_dict["memory_single_row"] = \
        float(target_dict["memory_total"]) / target_dict["num_rows"]
    # Count of fully-duplicated rows, wrapped with the total for percentage context.
    target_dict["duplicates"] = NumWithPercent(sum(source.duplicated()),
                                               len(source))
def delete_duplicates(from_date=None, to_date=None, expenses: pd.DataFrame = None):
    """Find near-duplicate expenses (same amount & date, similar text) and delete them.

    Args:
        from_date, to_date: Range used to fetch expenses when none are given.
        expenses: Optional pre-fetched frame; fetched via get_expenses() when
            missing or empty without a complete date range.
    """
    # Bug fix: guard against expenses=None before touching .empty — the
    # original dereferenced the None default when called without a frame.
    if expenses is None or ((not from_date or not to_date) and expenses.empty):
        expenses = get_expenses(from_date, to_date)
    # Only rows sharing (amount, date) with another row can be duplicates.
    expenses = expenses[expenses.duplicated(subset=['amount', 'date'], keep=False)]
    # Self-join on (amount, date) to build candidate pairs, drop self-pairs.
    merged = expenses.merge(right=expenses, on=['amount', 'date'])
    merged: pd.DataFrame = merged[merged['id_x'] != merged['id_y']]
    # Each pair appears twice ((a,b) and (b,a)); keep one of each.
    merged = merged.iloc[::2, :]
    merged['score'] = merged.apply(similar, axis=1)
    duplicates = merged[merged['score'] > 0.6]
    logger.info(
        f'Duplicate entries: {"none" if duplicates.empty else duplicates}')
    duplicates.apply(delete_expense, axis=1)
def test_duplicated_subset(subset, keep): df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a'], 'C': [np.nan, 3, 3, None, np.nan]}) if subset is None: subset = list(df.columns) elif isinstance(subset, str): # need to have a DataFrame, not a Series # -> select columns with singleton list, not string subset = [subset] expected = df[subset].duplicated(keep=keep) result = df.duplicated(keep=keep, subset=subset) tm.assert_series_equal(result, expected)
def calculate(df: pd.DataFrame):
    """For each fully-duplicated click row, record the distance from its
    index to the first index of its duplicate run; non-duplicates get NaN.
    Adds a 'DuplicateRowIndexDiff' column and returns it as a one-column frame."""
    key_cols = ['ip', 'device', 'os', 'channel', 'app', 'click_time']
    is_duplicated = df.duplicated(subset=key_cols, keep=False)
    features = np.zeros(len(df))
    features[~is_duplicated] = np.nan
    dup_df = df[is_duplicated]
    keys = zip(dup_df.ip, dup_df.device, dup_df.os, dup_df.channel,
               dup_df.app, dup_df.click_time)
    run_start = None
    last_key = None
    for idx, key in zip(dup_df.index, keys):
        # A new key starts a new duplicate run anchored at this index.
        if last_key != key:
            run_start = idx
        features[idx] = idx - run_start
        last_key = key
    df['DuplicateRowIndexDiff'] = features
    return df[['DuplicateRowIndexDiff']]
def __remove_duplicates__(df: pd.DataFrame, subset: Union[List[str], str] = None) -> pd.DataFrame:
    """Removes duplicated rows from `df`

    :param df: A pandas DataFrame
    :param subset: Column name/s to identify duplicates. If it is `None` all columns will be used
    :return: The DataFrame with no duplicated rows
    """
    if subset is not None:
        Transform.__guard_against_non_existent_columns__(df, subset)
    else:
        subset = df.columns
    has_duplicates = df.duplicated(subset).sum() > 0
    if has_duplicates:
        # keep='last': the final occurrence of each duplicate survives.
        df.drop_duplicates(subset, inplace=True, keep='last')
    return df
def dup_rows(data: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Get duplicate rows.

    Parameters
    ----------
    data : DataFrame
        Data for getting duplicate rows.
    **kwargs : dict, optional
        Extra arguments to `DataFrame.duplicated`. Refer to Pandas
        documentation for all possible arguments.

    Returns
    -------
    DataFrame
        Table of duplicate rows.
    """
    mask = data.duplicated(**kwargs)
    # Copy so callers can mutate the result without touching `data`.
    return data.loc[mask].copy()
def _general_dict(self, scenario, bohb_parallel=False): """ Generate the meta-information that holds for all runs (scenario info etc) Parameters ---------- scenario: smac.Scenario scenario file to get information from bohb_parallel: Union[False, int] if set, defines number of parallel runs """ # general stores information that holds for all runs, runspec holds information on a run-basis general = OrderedDict() # TODO with multiple BOHB-run-integration # overview['Run with best incumbent'] = os.path.basename(best_run.folder) #if num_conf_runs != 1: # overview['Number of configurator runs'] = num_conf_runs self.logger.debug("bohb_parallel in overview: %s", bohb_parallel) if bohb_parallel: general['# aggregated parallel BOHB runs'] = bohb_parallel # Scenario related general['# parameters'] = len(scenario.cs.get_hyperparameters()) general['Deterministic target algorithm'] = scenario.deterministic general['Optimized run objective'] = scenario.run_obj if scenario.cutoff or scenario.run_obj == 'runtime': general['Cutoff'] = scenario.cutoff if any([str(lim)!='inf' for lim in [scenario.wallclock_limit, scenario.ta_run_limit, scenario.algo_runs_timelimit]]): general['Walltime budget'] = scenario.wallclock_limit general['Runcount budget'] = scenario.ta_run_limit general['CPU budget'] = scenario.algo_runs_timelimit # Instances num_train, num_test = [len([i for i in insts if i]) for insts in [scenario.train_insts, scenario.test_insts]] if num_train > 0 or num_test > 0: general['# instances (train/test)'] = "{} / {}".format(num_train, num_test) # Features num_feats = scenario.n_features if scenario.feature_dict else 0 num_dup_feats = 0 if scenario.feature_dict: dup_feats = DataFrame(scenario.feature_array) num_dup_feats = len(dup_feats[dup_feats.duplicated()]) # only contains train instances if num_feats > 0: general['# features (duplicates)'] = "{} ({})".format(num_feats, num_dup_feats) return general
def extract_duplicates(data: DataFrame, duplicate_columns: List[str],
                       index_columns: List[str], fill_na='NA') -> DataFrame:
    """Return groups of rows that collide on `duplicate_columns`,
    with their `index_columns` collected into lists, one row per group."""
    # Fill nulls first: without this we would get false negatives,
    # as nan != nan in Python.
    if fill_na:
        data = data.fillna(fill_na)
    # Restrict to the relevant columns and to rows that are duplicated.
    data = data[duplicate_columns + index_columns]
    data = data[data.duplicated(keep=False, subset=duplicate_columns)]
    grouped = (data.reset_index().groupby(duplicate_columns)
               [index_columns].apply(to_nested_series))
    if grouped.empty:
        return DataFrame()
    # Keep only groups that truly contain more than one member.
    first_index_column = grouped[index_columns[0]]
    multi = grouped[first_index_column.apply(len) > 1].reset_index(drop=True)
    return explode_rows_with_lists(multi)
def rows(self):
    """Yield the column values of rows that must be (re-)copied into the
    target table.

    Compares the table's current contents against the incoming pickled
    frame, deletes rows whose attribute values changed, and yields the
    replacement rows.
    """
    from pandas import read_pickle, DataFrame, merge, concat
    connection = self.output().connect()
    cursor = connection.cursor()
    # NOTE(review): table/column names are interpolated straight into the
    # SQL — safe only if they come from trusted task configuration.
    sql = f"""
        SELECT {', '.join(['id'] + list(self.columns))}
        FROM {self.table};
    """
    cursor.execute(sql)
    results = cursor.fetchall()
    current_df = DataFrame(results, columns=['id'] + list(self.columns))
    with self.input().open('r') as f:
        df = read_pickle(f, compression=None)
    if not df.empty:
        # get list of dim values that are already in the database, but have
        # changed their attributes
        merged = merge(current_df, df, on=self.columns, how='inner')
        current_df = concat([current_df, merged], axis=0)
        # Rows that appear twice after the concat are unchanged; rows that
        # appear once are the changed/new ones kept in `new`.
        is_duplicate = current_df.duplicated(keep=False)
        # duplicates = current_df[is_duplicate]
        new = current_df[~is_duplicate]
        to_delete = new['id'].tolist()
        to_copy = df[df[list(self.id_cols)].isin(new[list(
            self.id_cols)].to_dict(orient='list')).all(axis=1)]
        to_copy = to_copy[list(self.columns)]
        # NOTE(review): join() assumes the ids are strings (TypeError for
        # ints) and produces `IN ()` when to_delete is empty — confirm
        # both cases are impossible for this table.
        delete_sql = f"""
            DELETE FROM {self.table}
            WHERE id IN ({', '.join(to_delete)});
        """
        cursor.execute(delete_sql)
        for index, line in to_copy.iterrows():
            # returns (index, Series) tuple
            yield line.values.tolist()
def process_duplicated_entries(dfm_stk_info: DataFrame, stockid):
    """Merge rows sharing a 股权登记日 (record date) into the first such row.

    For each duplicate: the 分红年度/分红方案 fields are folded into the kept
    row; once any merged row carries the 'E' parse-error flag, the numeric
    per-10-share fields are invalidated. Duplicates are then dropped in place.

    Args:
        dfm_stk_info: Dividend-info frame, mutated in place.
        stockid: Stock identifier, used only for logging.
    """
    dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])]
    for index, row in dfm_duplicated.iterrows():
        # Bug fix: merge into the *kept* (first) row for this date. The
        # original indexed with the duplicate's own label after
        # drop_duplicates(inplace=True) had removed it, raising KeyError
        # whenever duplicates existed.
        mi = dfm_stk_info[dfm_stk_info['股权登记日'] == row['股权登记日']].index[0]
        dfm_stk_info.loc[mi, '分红年度'] = add_considering_None(
            dfm_stk_info.loc[mi]['分红年度'], row['分红年度'])
        dfm_stk_info.loc[mi, '分红方案'] = (
            dfm_stk_info.loc[mi]['分红方案'] + '|' + row['分红方案'])
        if dfm_stk_info.loc[mi]['方案文本解析错误标识位'] != 'E':
            if row['方案文本解析错误标识位'] == 'E':
                # A parse error poisons the merged record: flag it and
                # invalidate the numeric per-10-share fields.
                dfm_stk_info.loc[mi, '方案文本解析错误标识位'] = 'E'
                dfm_stk_info.loc[mi, '派息(税前)(元)/10股'] = None
                dfm_stk_info.loc[mi, '转增(股)/10股'] = None
                dfm_stk_info.loc[mi, '送股(股)/10股'] = None
            else:
                dfm_stk_info.loc[mi, '派息(税前)(元)/10股'] = add_considering_None(
                    dfm_stk_info.loc[mi]['派息(税前)(元)/10股'], row['派息(税前)(元)/10股'])
                dfm_stk_info.loc[mi, '转增(股)/10股'] = add_considering_None(
                    dfm_stk_info.loc[mi]['转增(股)/10股'], row['转增(股)/10股'])
                dfm_stk_info.loc[mi, '送股(股)/10股'] = add_considering_None(
                    dfm_stk_info.loc[mi]['送股(股)/10股'], row['送股(股)/10股'])
        logprint('Stock %s 股权登记日 %s 记录合并到主记录中. %s'
                 % (stockid, row['股权登记日'], tuple(row)))
    # Drop the now-merged duplicate rows, keeping the enriched first rows.
    dfm_stk_info.drop_duplicates('股权登记日', inplace=True)
def get_duplicates(df: pd.DataFrame, supported_columns) -> Optional[pd.DataFrame]:
    """Obtain the most occurring duplicate rows in the DataFrame.

    Args:
        df: the Pandas DataFrame.
        supported_columns: the columns to consider

    Returns:
        A subset of the DataFrame, ordered by occurrence.
    """
    n_head = config["duplicates"]["head"].get(int)
    # Duplicate reporting disabled, or nothing usable to group on.
    if n_head <= 0 or not supported_columns:
        return None
    dup_mask = df.duplicated(subset=supported_columns, keep=False)
    counts = (df[dup_mask]
              .groupby(supported_columns)
              .size()
              .reset_index(name="count"))
    return counts.nlargest(n_head, "count")
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
    """
    Provides information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and Index Object of rows dropped
    """
    frame = pd.DataFrame(data).copy()
    # Labels of every non-first occurrence; these are the rows removed.
    duplicate_labels = frame[frame.duplicated()].index.tolist()
    deduplicated = frame.drop(duplicate_labels, axis="index")
    return deduplicated, duplicate_labels
def get_duplicates(
        df: pd.DataFrame,
        supported_columns) -> Tuple[Dict[str, Any], Optional[pd.DataFrame]]:
    """Obtain the most occurring duplicate rows in the DataFrame.

    Args:
        df: the Pandas DataFrame.
        supported_columns: the columns to consider

    Returns:
        A subset of the DataFrame, ordered by occurrence.
    """
    n_head = config["duplicates"]["head"].get(int)
    metrics: Dict[str, Any] = {}
    # Duplicate reporting disabled entirely: no metrics, no listing.
    if n_head <= 0:
        return metrics, None
    # Nothing usable to group on (or an empty frame): report zero duplicates.
    if not supported_columns or len(df) == 0:
        metrics["n_duplicates"] = 0
        metrics["p_duplicates"] = 0.0
        return metrics, None
    duplicates_key = config["duplicates"]["key"].get(str)
    if duplicates_key in df.columns:
        raise ValueError(
            f"Duplicates key ({duplicates_key}) may not be part of the DataFrame. Either change the "
            f" column name in the DataFrame or change the 'duplicates.key' parameter."
        )
    dup_mask = df.duplicated(subset=supported_columns, keep=False)
    grouped = (df[dup_mask].groupby(supported_columns)
               .size().reset_index(name=duplicates_key))
    metrics["n_duplicates"] = len(grouped[duplicates_key])
    metrics["p_duplicates"] = metrics["n_duplicates"] / len(df)
    return metrics, grouped.nlargest(n_head, duplicates_key)
def load_recordings(self, databases):
    """Load the KEYS tables of the given databases into a single frame.

    Args:
        databases: Iterable of database names; each maps to a
            data/KEYS/<name>.key whitespace-delimited table.

    Returns:
        DataFrame indexed by unique recording name, with 'channel'
        translated to a numeric code.
    """
    from pandas import concat
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    frames = []
    for database in databases:
        path = op.join(data_dir, 'KEYS', '{db}.key'.format(db=database))
        frames.append(read_table(path, delim_whitespace=True, names=FIELDS))
    # DataFrame.append was removed in pandas 2.0; collect and concat instead.
    recordings = concat(frames) if frames else DataFrame()
    # remove duplicates
    recordings = recordings[~recordings.duplicated()]
    # index using unique recording name
    recordings = recordings.set_index('recording')
    # translate channels (a --> 1, b --> 2, x --> 1)
    func = lambda channel: {'a': 1, 'b': 2, 'x': 1}[channel]
    recordings['channel'] = recordings['channel'].apply(func)
    return recordings
def project_state_assets(self, df: pd.DataFrame, table: SQLTable) -> pd.DataFrame:
    """Clean a project-state assets frame against `table`'s schema.

    Replaces "-" placeholders with 0, casts float/date columns through the
    project's ccast pipeline, and warns (message_box) about duplicated
    primary-key combinations.
    """
    self._log.debug(f"Cleaning {whoami()}")
    if self.column_check(df.columns.tolist(), table):
        df = df.replace("-", 0)
        float_cols = get_column_types(table, Float)
        date_cols = get_column_types(table, Date)
        df = df.pipe(ccast, (float_cols, tf_net_acres)).pipe(
            ccast, (date_cols, tf_date))
        pk = table.primary_key.columns.keys()
        duplicates = df.duplicated(pk)
        dup_idx = ", ".join(
            df[duplicates].index.astype("unicode").tolist())
        # NOTE(review): np.all() only fires when EVERY row is a duplicate;
        # np.any() looks like the intended check — confirm before changing.
        if np.all(duplicates):
            message_box(
                f"Duplicate assets found at {dup_idx}. {', '.join(pk)} combination should be unique."
            )
    return df
def split_jhu_state_data(
        df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Aggregate JHU per-region rows into per-state totals.

    Returns the original frame unchanged plus a new per-state frame whose
    coordinates are re-geocoded via Nominatim (network call).
    """
    # NOTE(review): duplicated(['state']) without keep=False excludes the
    # first row of each state from the sum — confirm this is intended.
    state = df[df.duplicated(['state'])]
    state = state.groupby('state').sum()
    state = state.reset_index()
    state.latitude = 0.0
    state.longitude = 0.0
    # Geocode each state name to a (lat, lon) centroid.
    geocoder = Nominatim(timeout=60)
    lat, lon = zip(*[
        pandemics.utils.geocode(geocoder, f'{state}, United States')
        for state in state.state
    ])
    state.latitude = lat
    state.longitude = lon
    # fips codes are meaningless after aggregating to state level.
    state = state.drop(columns=['fips'])
    return df, state
def find_by(df: pd.DataFrame, columns: List[str]) -> Result:
    """Compare items rows in `df` by `columns`

    Returns:
        Any duplicates
    """
    result = Result(f"Duplicates")
    result.items_count = len(df)
    # Rows missing every key column cannot meaningfully collide.
    candidates = df.dropna(subset=columns, how="all")
    dup_frame = candidates[candidates.duplicated(columns, keep=False)][columns]
    if dup_frame.empty:
        return result
    errors = {}
    for _, group in dup_frame.groupby(columns):
        parts = [f"'{group[c].iloc[0]}' `{c}`" for c in columns]
        errors[f"same {', '.join(parts)}"] = list(group.index)
    result.add_error(
        f"{len(dup_frame)} duplicate(s) with same {', '.join(columns)}",
        errors=errors)
    return result
def fix_raw_results(raw_df: pd.DataFrame):
    """
    Fixes some issues with the raw results
    """
    # There's some redundancy with the raw results so we'll remove duplicated
    # results
    raw_df = raw_df[~raw_df.duplicated()]
    # There might also be some differences between run numbers between
    # experiments, so average the metric values within each experiment id.
    uniq_ids = raw_df.id.unique()
    n = len(uniq_ids)
    id_dfs = [pd.DataFrame()] * n
    for (i, id_val) in enumerate(uniq_ids):
        # Bug fix: numeric_only=True — pandas >= 2.0 no longer silently
        # drops non-numeric columns (like 'id') when averaging, so the old
        # bare .mean() raises TypeError here.
        tmp_df = raw_df[raw_df['id'] == id_val].groupby(
            'metric', as_index=False).mean(numeric_only=True)
        tmp_df['id'] = id_val
        id_dfs[i] = tmp_df
    df = pd.concat(id_dfs, ignore_index=True)
    df = df.reset_index(drop=True)
    return df
def clean_dict(df: DataFrame, column): print("Preprocessing data text - cleanining") #drop duplicates d = df.duplicated(column, keep='first') print("removing duplicate sentences. Duplicates = {} senteces".format( len(df[d][column]))) #lowercase df[column] = df[column].str.lower() print("len before removing: {}".format(len(df[column]))) df.drop_duplicates(subset=column, inplace=True, keep='first') print("len after removing: {}".format(len(df[column]))) stop = stopwords.words('english') for i, row in df.iterrows(): sentence = row[column] #remove http url sentence = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', sentence) #handle emojis before html sentence = convert_emojis(sentence) sentence = convert_emoticons(sentence) #remove html soup = BeautifulSoup(sentence, "html.parser") sentence = soup.get_text() #remove common stop words - might have to use it earlier because some stop words use symbols xes: isn't sentence = " ".join(x for x in sentence.split() if x not in stop) #remove punctuation sentence = sentence.translate(str.maketrans('', '', string.punctuation)) #stemming?lemmatize? sentence = lemmatize_words(sentence) #spelling correction? textblob or pyspellchecker - problem: slang and not english words will be transformed in other words. df.at[i, column] = sentence print(df[column].head()) return df
def clean_trades_df(df: pd.DataFrame) -> pd.DataFrame:
    """Filter, de-duplicate and normalise a raw trades frame.

    Drops irregular trades, trades whose SIP/exchange timestamps disagree by
    a second or more, outliers (via median_outlier_filter), duplicate trades
    and zero-volume rows, then returns a chronologically sorted frame with
    columns ['date_time', 'price', 'volume'].
    """
    # Remember the starting tick count for the drop report.
    initial_rows = df.shape[0]
    # Exclude irregular-condition trades outright.
    df = df.loc[df.irregular == False]
    # Keep trades whose SIP and exchange timestamps agree within one second.
    timestamp_gap = df.sip_dt - df.exchange_dt
    df = df.loc[timestamp_gap < pd.to_timedelta(1, unit='S')]
    # Median filter removes outlier trades.
    df = median_outlier_filter(df)
    # Drop exact duplicates over the identifying fields.
    dedup_cols = ['sip_dt', 'exchange_dt', 'sequence',
                  'trade_id', 'price', 'size']
    num_dups = sum(df.duplicated(subset=dedup_cols))
    if num_dups > 0:
        print(num_dups, 'duplicated trade removed')
        df = df.drop_duplicates(subset=dedup_cols)
    # Zero-size trades carry no volume information.
    df = df.loc[df['size'] > 0]
    droped_rows = initial_rows - df.shape[0]
    print('dropped', droped_rows, 'ticks (',
          round((droped_rows / initial_rows) * 100, 2), '%)')
    # Chronological order, then the minimal column subset.
    df = df.sort_values(['sip_dt', 'exchange_dt', 'sequence'])
    df = df[['sip_dt', 'price', 'size']]
    return df.rename(columns={'sip_dt': 'date_time',
                              'size': 'volume'}).reset_index(drop=True)
def check_primary_key(
    df: pd.DataFrame,
    primaryKey: Union[str, List[str]],
    skip_required: bool = False,
    skip_single: bool = False,
) -> List[Union[ConstraintError, PrimaryKeyError]]:
    """
    Check table primary key.

    Arguments:
        df: Table.
        primaryKey: Primary key field names.
        skip_required: Whether to not check for missing values in primary key fields.
        skip_single: Whether to not check for duplicates if primary key is one field.

    Returns:
        A list of errors.
    """
    errors = []
    key = _as_list(primaryKey)
    # Nothing to check without key fields.
    if not key:
        return errors
    # Each key field must be fully populated unless the caller opts out.
    if not skip_required:
        for name in key:
            errors += check_field_constraints(df[name],
                                              required=True,
                                              field=dict(name=name))
    # A single-field key may be exempted from the uniqueness check.
    if skip_single and len(key) < 2:
        return errors
    duplicated_mask = df.duplicated(subset=key)
    if duplicated_mask.any():
        errors.append(
            PrimaryKeyError(
                primaryKey=key,
                values=df[key][duplicated_mask].drop_duplicates().values.tolist(),
            ))
    return errors
# Pivot the long-format data to wide format.
# FIX: positional pivot arguments were removed in pandas 2.0.
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]
pivoted['value'][:5]
# Equivalent reshaping via a MultiIndex and unstack.
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

### Removing duplicate rows
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data
data.duplicated()
data.drop_duplicates()
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# FIX: `take_last=True` was removed from pandas; keep='last' is the
# modern spelling.
data.drop_duplicates(['k1', 'k2'], keep='last')

### Transforming data with a function or mapping
#1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
def test_drop_duplicates():
    """Exercise DataFrame.drop_duplicates/duplicated across subset forms,
    keep modes and integer dtypes (incl. GH 11376 and GH 11864)."""
    import pandas as pd  # local import: needed for the append->concat fix below

    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    # FIX: `lrange` came from the removed pandas.compat module.
                    'D': list(range(8))})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    # FIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
    df = pd.concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)
    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 17 09:44:35 2017

@author: HanKin

Minimal demo of DataFrame.duplicated / drop_duplicates.
"""
from pandas import Series, DataFrame

# Rows 0 and 1 are fully identical on purpose.
data = DataFrame({'k': [1, 1, 2, 2], 'y': [2, 2, 4, 1]})
print(data)

# Boolean Series: True on every row repeating an earlier one.
IsDuplicated = data.duplicated()
print(IsDuplicated)
print(type(IsDuplicated))

# Keep only the first occurrence of each duplicated row.
data = data.drop_duplicates()
print(data)
f3 = pd.read_csv('rcs/macrodata.csv')
periods = pd.PeriodIndex(year=f3.year, quarter=f3.quarter, name='date')
f3 = DataFrame(f3.to_records(),
               columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
               index=periods.to_timestamp('D', 'end'))

ldata = f3.stack().reset_index().rename(columns={0: 'value'})
# FIX: positional pivot arguments were removed in pandas 2.0.
wdata = ldata.pivot(index='date', columns='item', values='value')
# print(ldata)
# print(wdata)

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
# Removing duplicate rows.
# data.duplicated() returns a boolean Series flagging repeated rows.
s1 = data.duplicated()
# FIX: DataFrame.ix was removed from pandas; use .loc with a boolean mask.
f4 = data.loc[np.logical_not(s1)]
# print(f4)
# drop_duplicates directly returns a DataFrame with duplicates removed.
f5 = data.drop_duplicates()
# print(f5)
# Filter on a given column, keeping the last occurrence.
f6 = data.drop_duplicates(['k1'], keep='last')
# print(f6)
# Add a matching column to the DataFrame; map can translate values of a Series.
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
from pandas import Series, DataFrame import pandas as pd import numpy as np # DUPLICATED VALUES ------------------------- ## create new dataframe zip3 = zip(['red', 'green', 'blue', 'orange']*3, [5, 10, 20, 40]*3, [':(', ':D', ':D']*4) df3 = DataFrame(zip3, columns = ['A', 'B', 'C']) ## pandas method `duplicated` df3.duplicated() # searching from top to bottom by default df3.duplicated(take_last = True) # searches bottom to top ## subset duplicated values df3.duplicated(subset = ['A', 'B']) df3.duplicated(['A', 'B']) ## HOW to get all values that have duplicates (purging) t_b = df3.duplicated() b_t = df3.duplicated(take_last = True) unique = ~(t_b | b_t) # complement where either is true unique = ~t_b & ~b_t unique df3[unique] # DROPPING DUPLICATES -------------------------------------------- df3.drop_duplicates()
from pandas.io.parsers import TextParser
# FIX: numpy 2.0 removed the `NaN` alias; `nan` works on every version.
from numpy import nan as NA
from lxml.html import parse
from urllib.request import urlopen
from lxml import objectify
from io import StringIO
###############################################################

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
print('\n')
print(data.duplicated())
print('\n')
print(data.drop_duplicates())
print('\n')
data['v1'] = range(7)
print(data.drop_duplicates(['k1']))
print('\n')
# FIX: `take_last=True` was removed from pandas; use keep='last'.
print(data.drop_duplicates(['k1', 'k2'], keep='last'))
print('\n')
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)
# -*- coding: utf-8 -*-
"""Duplicate-data handling demo (重复数据处理)."""
#%%
import pandas as pd
from pandas import Series, DataFrame
# FIX: `string.letters` is Python 2 only; ascii_letters is the 3.x name.
from string import ascii_letters as letters

d1 = DataFrame({'a': ['a', 'b'] * 6,
                'b': [1, 2, 3, 4, 5, 6] * 2,
                'c': [1, 3, 5] * 4})

#%% list duplicated rows
d1.duplicated()

#%% select the non-duplicated rows
d1[d1.duplicated() == False]

#%% duplicates with respect to a single column
d1.duplicated('a')

#%% ... or two or more columns
d1.duplicated(['a', 'c'])

#%% keep the last occurrence instead of the first
# FIX: `take_last=True` was removed from pandas; use keep='last'.
d1.duplicated('a', keep='last')

#%% drop duplicates
# drop_duplicates() is equivalent to d1[d1.duplicated() == False]
d1.drop_duplicates()
def create_fip(temporary_store = None, year = None):
    """Build the 'fipDat' table of 'fip' individuals for *year*.

    'fip' individuals are dependants ("personnes à charge", pac) declared on
    tax forms but missing from the erf/eec survey tables; this reconstructs
    them so both data sources stay consistent, then stores the result as
    'fipDat_<year>' (plus the pac/noindiv link table 'pacIndiv_<year>').

    Arguments:
        temporary_store: store holding 'indivim_<year>'; receives the
            'pacIndiv_<year>' and 'fipDat_<year>' tables. Required.
        year: survey year. Required.

    NOTE(review): this function relies on legacy APIs — pandas
    `duplicated(..., take_last=True)` and `DataFrame.sort(columns=...)`
    (both removed in modern pandas) and Python 2 `zip` returning a list
    when building the key columns — so it only runs on the old
    Python 2 / pandas stack it was written for.
    """
    assert temporary_store is not None
    assert year is not None
    # fip: "fichier d'imposition des personnes" (personal income-tax file)
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac is coded on 5 characters (1 type letter + 4-digit birth year),
    # so the longest anaisenf string bounds the pac count per foyer.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.
    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, \
        "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Placeholder frame: one (declaration, type, birth-year) triple per pac slot.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    # TODO: using values to deal with mismatching indexes
    for i in range(1, nb_pac_max + 1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values
    # Long format: one row per (foyer, pac slot).
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    # Filter: work only on types F and G.
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()
    # same_pair: another F/G shares this (declaration, birth year);
    # is_twin: same declaration, birth year AND type (true twins are kept).
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Keep pacs with distinct (declaration, birth year) pairs plus the twins;
    # drop the remaining F/G pairs (the same child coded twice).
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # Rows never examined above (types J, N, R) are kept by default.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # NOTE(review): relies on Python 2 zip returning a list, so each key
    # column holds (birth year, truncated declaration id) tuples — this
    # breaks under Python 3, where zip is a lazy iterator.
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # fip = candidates not already present in either declaration of the eec.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    # Combine the links found through each declaration, handling empty sides.
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    # One row per individual: keep the first link per noindiv.
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    # To be used later to set idfoy
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # Fill in the eec-style columns a fip individual must carry.
    fip['persfip'] = 'pac'
    fip['year'] = year
    # BUG (original note): no year column in the source DataFrame
    fip['year'] = fip['year'].astype('float')
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    # TODO: actrec is ambiguous for fip children aged 16-20: unknown whether they are students or employed
    # TODO: months of birth of FIP children unavailable; Alexis: clearly cannot be recovered
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi on clashing rows until every (noi, ident) pair is unique.
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def test_duplicated_nan_none(keep, expected): df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected)
b[-1] = np.nan # np中实现ifelse语句,a中空值位置用b替代 np.where(pd.isnull(a),b,a) # pd中类似函数,b中控制用a替代 b[:-2].combine_first(a[2:]) # DataFrame中使用 df1 = DataFrame({'a':[1.,np.nan,5.,np.nan], 'b':[np.nan,2.,np.nan, 6.], 'c':range(2,18,4)}) df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.], 'b':[np.nan,3.,4,6.,8.]}) df1.combine_first(df2) ## 移除重复数据 data = DataFrame({'k1':['one']*3+['two']*4, 'k2':[1,1,2,3,3,4,4]}) data.duplicated() # 去除重复值,默认留第一个 data.drop_duplicates() # 根据某一列去除重复值 data['v1'] = range(7) data.drop_duplicates(['k1']) # 保留最后一个 data.drop_duplicates(['k1','k2'], take_last=True) ## 利用函数或映射进行数据转换 data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami', 'corned beef','Bacon','pastrami','honey ham','nova lox'], 'ounces':[4,3,12,6,7.5,8,3,5,6]}) meat_to_animal = { 'bacon':'pig', 'pulled pork':'pig',
# FIX: Python 2 `print` statements converted to print() calls throughout.
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
# dfMerged = pd.merge(df1, df2, on='key')
# print(dfMerged)
# dfMergedOuter = pd.merge(df1, df2, how='outer')
# print(dfMergedOuter)

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
# dfMerged = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# print(dfMerged)

left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'foo', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'foo', 'one', 'one'],
                   'rval': [4, 5, 6, 7]})
dfMergedOuter = pd.merge(left, right, how='outer')
# print(dfMergedOuter)

arr = np.arange(12).reshape((6, 2))
# print(arr)
# print(np.arange(12))
arrConcat = np.concatenate([arr, arr], axis=1)
# print(arrConcat)

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
# print(data)
dataDuplicate = data.duplicated()
print(dataDuplicate)
dropDuplicate = data.drop_duplicates()
print(dropDuplicate)
def test_duplicated_keep(keep, expected): df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected)
match = regex_email.findall(email)
df = DataFrame(match)

# ---------------------------------------- pandas course notes
# df: table of students with e-mail addresses

# DataFrame.rename renames the columns of a DataFrame.
# FIX: the original dict literal repeated the same key twice, which silently
# keeps only one mapping — use one distinct old-name key per new name.
df = df.rename(columns={'ancien': 'nouveau'})
# Row index labels can also be rewritten with a lambda.
df.index = df.index.map(lambda x: 'Eleve ' + str(x))

# Duplicate handling.
df.duplicated()  # check whether duplicate rows exist
# FIX: the method is drop_duplicates (plural) and it returns a new frame,
# so the result must be assigned to take effect.
df = df.drop_duplicates()

# -------------------------- process the aliments.csv file
aliments = pd.read_csv('aliments.csv')

# Build an aliments x contained-traces indicator matrix.
# FIX: 'traces]' was a typo for 'traces'.
aliments['traces'].isnull()
# FIX: dropna on the column subset keeps a DataFrame, so the ['traces']
# selection below stays valid.
aliments_with_traces = aliments.dropna(subset=['traces'])
traces_iter = (set(x.split(',')) for x in aliments_with_traces['traces'])
# FIX: set.union needs the member sets unpacked as separate arguments.
traces = set.union(*traces_iter)
# FIX: the indicator frame was never assigned to a name; also pandas rejects
# a raw set as columns, and .ix was removed — use sorted columns and .loc.
dummies = DataFrame(np.zeros((len(aliments_with_traces), len(traces))),
                    columns=sorted(traces))
for i, tr in enumerate(aliments_with_traces.traces):
    dummies.loc[i, tr.split(',')] = 1
# FIX: Python 2 `print` statements converted to print() calls throughout.
info = DataFrame(db, columns=info_keys)
print(info[:5])
print(info)
print(pd.value_counts(info.group)[:10])

# Collect the nutrients of every food record into one long frame.
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients)
print(nutrients.duplicated().sum())
nutrients = nutrients.drop_duplicates()

# Both frames carry 'description'/'group'; rename to keep them apart.
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info)
col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)
ndata = pd.merge(nutrients, info, on='id', how='outer')
def create_fip(year = 2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len))/5 print "il ya a au maximum %s 
pac par foyer" %nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns) fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1,nb_pac_max+1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)] fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration','naia','type_pac']) # TODO: check if useful fip.set_index(["declaration","pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG),'/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN fip.update(tyFG) print 'enfants F & G 
traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')] pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # 
We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1' , len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum() print 'nb de NaN', 
pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]] individec1 = individec1.rename(columns={'declar1':'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]] individec2.rename(columns={'declar2':'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip 
= concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf']<=15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi','ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100*fip['ident'] + fip['noidec'] fip['noindiv'] = 100*fip['ident'] + fip['noi'] fip['type_pac'] = 0 ; fip['key'] = 0 print fip.duplicated('noindiv').value_counts() save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip 
sauvegardé'
# Peek at the nutrient list of the first food record.
nutrients = DataFrame(db[0]['nutrients'])
print(nutrients[:7])

# Metadata columns for the whole database.
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
print(pd.value_counts(info.group)[:10])

# One frame per food record, tagged with the record id, stacked into one.
nutrients = [DataFrame(rec['nutrients']).assign(id=rec['id']) for rec in db]
nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients[:10])

# Drop the exact-duplicate nutrient rows.
print(nutrients.duplicated().sum())
nutrients = nutrients.drop_duplicates()

# 'description'/'group' clash between the two frames: rename both sides.
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info[:10])

col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients[:10])
from pandas import Series, DataFrame import pandas as pd import numpy as np # DUPLICATED VALUES ----------------------------------- ## create a new data frame zip3 = zip(['red', 'green', 'blue', 'orange']*4, [5, 10, 20, 40]*3, [':(', ':D', ':D']*4) df3 = DataFrame(zip3, columns = ['A', 'B', 'C']) df3 ## returns boolean vector of duplicated rows of a whole DataFrame or subset using method `duplicated` ## IMPORTANT: python, by default, searches for duplicated values from top-to-bottom ## and will not mark a row as "duplicated" until it actually finds another instance df3.duplicated() # defaults using all rows searching top-to-bottom df3.duplicated(take_last = True) # option `take_last = True` searches bottom-to-top ## SUBSET duplicates # if we want the duplicated criteria to be of a subset, we can do that too df3.duplicated(subset = ['A', 'B']) df3.duplicated(['A', 'B']) # same as before ## HOW to get all values that have a duplicate t_b = df3.duplicated() b_t = df3.duplicated(take_last = True) unique = ~(t_b | b_t) # negate where either is true unique unique = ~t_b & ~b_t # same as above unique