def _cluster(self, df: pd.DataFrame, iteration: int) -> pd.DataFrame:
    """Write cluster ids for the projected parameters into ``df``.

    The clusterer configured for ``iteration`` is called on the projection
    columns (as a NumPy array) and its output is pushed into the
    ``self.cluster_header`` column through ``DataFrame.update``. ``df`` is
    modified in place and also returned.

    NOTE(review): ``DataFrame.update`` only touches columns that already
    exist and aligns on the index, so this presumably expects ``df`` to
    already contain the cluster column and to use a default integer index --
    confirm against the caller.
    """
    self._log("Clustering projected parameter space...")
    local_config = self.configuration.local_configurations[iteration]
    projected = df[self.projection_headers].to_numpy()
    labels = local_config.clusterer(projected)
    df.update({self.cluster_header: labels})
    return df
def test_update_raise_on_overlap(self):
    """``update(errors='raise')`` must reject overlapping non-NA data."""
    base_rows = [[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3], [1.5, nan, 3]]
    df = DataFrame(base_rows)
    other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2])

    with pytest.raises(ValueError, match="Data overlaps"):
        df.update(other, errors='raise')
def build_similarity_matrix(self, fingerprint_matrices):
    """Build the pairwise similarity matrix of the given fingerprints.

    :param fingerprint_matrices: mapping of name -> fingerprint, or ``None``.
    :return: ``None`` when ``fingerprint_matrices`` is ``None``; otherwise a
        DataFrame indexed (rows and columns) by the mapping's names. The
        diagonal is set to ``-1``; every other cell holds
        ``self.similarity_measure(self.compare(fp1, fp2))`` and is mirrored
        so the matrix is symmetric.

    If ``self.sim_pickle_path`` points to an existing file, its unpickled
    content is merged into the matrix first and only cells that are still
    NaN are computed.

    NOTE(review): the warning below announces that a new pickle file will be
    created, but this function never writes one -- presumably done by the
    caller; confirm.
    """
    # ``is None`` instead of ``== None``: equality on array-like inputs is
    # element-wise and cannot be used as a truth value.
    if fingerprint_matrices is None:
        return None

    names = list(fingerprint_matrices.keys())
    similarity_matrix = DataFrame(index=names, columns=names)

    if self.sim_pickle_path is not None:
        if os.path.isfile(self.sim_pickle_path):
            print("Found pickled similarity matrix at '" + self.sim_pickle_path + "', importing...")
            # SECURITY NOTE(review): pickle.load can execute arbitrary code;
            # only point sim_pickle_path at trusted files.
            with open(self.sim_pickle_path, 'rb') as sim_pickle:
                similarity_matrix.update(pickle.load(sim_pickle))
        else:
            print("Warning: was asked to look for similarity matrix at '" + self.sim_pickle_path + "'")
            print("Couldn't find one -- new pickle file will be created.")

    for name1, fp1 in fingerprint_matrices.items():
        for name2, fp2 in fingerprint_matrices.items():
            if name1 == name2:
                similarity_matrix.loc[name1, name2] = -1
            elif np.isnan(similarity_matrix.loc[name1, name2]):
                # Not known yet (neither loaded nor mirrored): compute once
                # and fill both symmetric cells.
                comparison_result = self.compare(fp1, fp2)
                similarity_measure = self.similarity_measure(comparison_result)
                similarity_matrix.loc[name1, name2] = similarity_measure
                similarity_matrix.loc[name2, name1] = similarity_measure

    return similarity_matrix
def update_listened_to_durations(playtracks_df: pd.DataFrame, current_timestamp_ms: int) -> pd.DataFrame:
    """
    Fill in how long each recently played track was listened to.

    Tracks are walked from most recent to oldest. For each track the gap until
    the next play (or until ``current_timestamp_ms`` for the most recent one)
    is compared with the track's duration from ``TRACK_DURATION_MS``; the
    smaller of the two is stored, converted with ``ListenerCommon.LISTENED_TIME[1]``.
    The stored timestamp is multiplied by 1000 before the comparison.

    :param playtracks_df: The input pandas DataFrame (ListenerCommon#SCHEMA)
    :param current_timestamp_ms: The timestamp of the time when query was run
    :return: DataFrame with ListenerCommon#LISTENED_TIME filled in, sorted by
             play timestamp in ascending order
    """
    timestamp_col = ListenerCommon.TIMESTAMP[0]
    newest_first = playtracks_df.sort_values(timestamp_col, ascending=False)

    durations_ms = []
    next_started_ms = current_timestamp_ms
    for _, track in newest_first.iterrows():
        full_length_ms = TRACK_DURATION_MS[track[ListenerCommon.TRACK_ID[0]]]
        started_ms = int(track[timestamp_col] * 1000)
        gap_ms = next_started_ms - started_ms
        # A gap longer than the track means the whole track was played;
        # otherwise the track was cut short by the next one.
        durations_ms.append(ListenerCommon.LISTENED_TIME[1](min(gap_ms, full_length_ms)))
        next_started_ms = started_ms

    listened = pd.Series(durations_ms, name=ListenerCommon.LISTENED_TIME[0], index=newest_first.index)
    newest_first.update(listened)
    return newest_first.sort_values(timestamp_col, ascending=True)
def test_update_with_subset_str_dtype(self, string_dtype):
    """Updating from a subset of rows must keep the string dtype (GH4094)."""
    df = DataFrame({"a": ["a", "b", "c"]}, dtype=string_dtype)
    all_but_last = df.copy()[:-1]
    dtype_before = df.copy().a.dtype

    df.update(all_but_last)

    assert df.a.dtype == dtype_before
def test_update_with_subset_bool_dtype(self):
    """Updating from a shorter bool frame must keep the bool dtype (GH4094)."""
    df = DataFrame({"a": [True, False]}, dtype=bool)
    shorter = DataFrame({"a": [False]}, dtype=bool)
    dtype_before = df.copy().a.dtype

    df.update(shorter)

    assert df.a.dtype == dtype_before
def apply(meta: dict, obj: Tuple[DataFrame], data: DataFrame) -> DataFrame:
    """Overlay the first frame of ``obj`` onto ``data`` and restore dtypes.

    ``data`` is updated in place with the non-NA values of a copy of
    ``obj[0]``. When ``meta['dtypes']`` is not ``None`` the result is cast
    with ``astype`` (which returns a new frame); otherwise ``data`` itself is
    returned.
    """
    patch = obj[0].copy()
    data.update(patch)

    target_dtypes = meta['dtypes']
    if target_dtypes is None:
        return data
    return data.astype(target_dtypes)
def test_update_with_subset_and_same_not_nullable_dtype(
    self, any_real_dtype
):
    """Updating from a row subset must keep a non-nullable real dtype (GH4094)."""
    values = Series([1, 2, 3], dtype=any_real_dtype)
    df = DataFrame({"a": values})
    all_but_last = df.copy()[:-1]

    df.update(all_but_last)

    assert df.a.dtype == any_real_dtype
def get_google_fit_steps(fname: Union[Path, str], data: pd.DataFrame) -> pd.DataFrame:
    """
    Update a dataframe with the JSON data gathered from the GoogleFit API.

    The folder is scanned for a ``.json`` file whose stem contains
    ``"GoogleFit"`` (if several match, the last one listed is used). The
    dataframe exposed by ``GoogleFitDataJSON(...).df`` is then merged into
    ``data`` in place with ``DataFrame.update``.

    Params:
        fname: path to data folder for participant X
        data: pandas data frame to store data (modified in place)

    Returns:
        The same ``data`` object.

    Raises:
        FileNotFoundError: if the folder holds no matching GoogleFit JSON
            file (previously this surfaced as an UnboundLocalError).
    """
    directory = Path(fname)

    # find json file with GoogleFit data
    path_to_json = None
    for child in directory.iterdir():
        if child.suffix == ".json" and "GoogleFit" in child.stem:
            path_to_json = child
    if path_to_json is None:
        raise FileNotFoundError(
            "No GoogleFit JSON file found in '{}'".format(directory)
        )

    # initiate interface to file
    json_interface = GoogleFitDataJSON(path_to_json)
    # the extracted dataframe
    new_data = json_interface.df

    # update data
    data.update(new_data)
    return data
def gather_frame_fields(df: pd.DataFrame, other_df: pd.DataFrame, index_label: str = None, fields: list = None, copy_frames: bool = False, append_missing: bool = True, **kwargs):
    """Update ``df`` from ``other_df`` and optionally append unmatched rows.

    :param df: frame to update.
    :param other_df: frame providing the new values.
    :param index_label: when given, any of the two frames whose index is not
        already named ``index_label`` and which has that column gets it as
        its index (the column is kept). This happens in place unless
        ``copy_frames`` is true.
    :param fields: when non-empty, only these columns of ``other_df`` are
        used for the update (appended rows still carry all columns).
    :param copy_frames: work on copies instead of the callers' objects.
    :param append_missing: when exactly ``True``, rows of ``other_df`` whose
        index is absent from ``df`` are appended to the result.
    :param kwargs: forwarded to ``DataFrame.update``.
    :return: the updated frame (a new object when rows are appended).
    """
    if copy_frames:
        df = df.copy()
        other_df = other_df.copy()

    if index_label is not None:
        for frame in (df, other_df):
            # Compare the names by value: ``is not`` tested object identity,
            # which is not a reliable string comparison.
            if frame.index.name != index_label and index_label in frame.columns:
                frame.set_index(index_label, drop=False, inplace=True)

    if fields:
        other_df_orig = other_df.copy()
        other_df = other_df.loc[:, fields]
    else:
        other_df_orig = other_df

    df.update(other_df, **kwargs)

    if append_missing is True:
        df_add = other_df_orig.loc[~other_df_orig.index.isin(df.index), :]
        df = pd.concat([df, df_add])

    return df
def _interpolate_column(self, dataframe: DataFrame, column_id: str, time: float) -> DataFrame:
    """Add a new entry to one column and re-interpolate that column.

    The non-NA values of ``column_id`` are extracted, the (date, value) pair
    returned by ``self._get_date_for_new_time`` is added, and the result is
    passed through ``utils.interpolate_dates``. The interpolated values are
    written back into ``dataframe`` in place; rows whose index does not exist
    in ``dataframe`` yet are appended, so the returned frame may be a new
    object.

    ``pd.concat`` replaces ``DataFrame.append``, which was removed in
    pandas 2.0.
    """
    person_df = pd.DataFrame(dataframe[column_id], index=dataframe.index)
    person_df = person_df.dropna()

    date_to_add, time_to_add = self._get_date_for_new_time(
        dataframe, column_id, time)

    # create new entry and add it
    new_df = pd.DataFrame([time_to_add],
                          columns=[column_id],
                          index=[date_to_add])
    person_df = pd.concat([person_df, new_df])

    # interpolate
    person_df = utils.interpolate_dates(person_df)

    # modify /!\ IN PLACE /!\ using non-NA values from another DataFrame
    dataframe.update(person_df)

    # append new values, if any
    rows_to_append = person_df.loc[person_df.index.difference(
        dataframe.index, sort=False)]
    dataframe = pd.concat([dataframe, rows_to_append], sort=False)
    return dataframe
def test_update_raise(self):
    """``update(raise_conflict=True)`` must reject overlapping non-NA data."""
    base_rows = [[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3], [1.5, nan, 3]]
    df = DataFrame(base_rows)
    other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2])

    with tm.assert_raises_regex(ValueError, "Data overlaps"):
        df.update(other, raise_conflict=True)
def update_df(df: pd.DataFrame, new_df: pd.DataFrame, on: (str, list) = None, mode='update'):
    """
    Update the data of a dataframe, matching rows on one or more columns.

    :param df: the frame to be updated
    :param new_df: the frame providing the new values
    :param on: column(s) to match rows on; defaults to None, which matches on the index
    :param mode: 'update' overwrites the matching cells directly,
                 'insert' only fills cells that are currently empty
    :return: the updated frame, with a fresh integer index
    :raises ValueError: on duplicated keys in ``new_df``, on an unknown ``mode``,
                        or when the row count changed while ``on`` was given
    """
    rows_before = len(df)
    keyed = on is not None

    if keyed:
        on = ensure_list(on)
        new_df = new_df.drop_duplicates()
        if new_df[on].duplicated().any():
            raise ValueError('new_df中有重复的索引列对应不同的值,请检查')
        # Keep only the keys that actually exist in df.
        known_keys = df[on].drop_duplicates()
        new_df = known_keys.merge(new_df, how='inner', on=on)
        df = df.set_index(on, drop=False)
        new_df = new_df.set_index(on, drop=False)

    if mode == 'update':
        df.update(new_df)
    elif mode == 'insert':
        df = df.combine_first(new_df)
    else:
        raise ValueError(f'参数{mode}错误,可选参数为 update or insert')

    df = df.reset_index(drop=True)
    if keyed and rows_before != len(df):
        raise ValueError('update后Dataframe结构发生变化,请检查')
    return df
def _pre_process(data: pd.DataFrame) -> pd.DataFrame:
    """Round the YMCA columns of ``data`` in place and return it.

    YMCA columns are those whose name starts with ``C.COL_YMCA_PREFIX``; the
    output of ``_round_ymca_columns`` for them is written back with
    ``DataFrame.update``.
    """
    ymca_columns = list(
        filter(lambda name: name.startswith(C.COL_YMCA_PREFIX), data.columns)
    )
    rounded = _round_ymca_columns(data, ymca_columns)
    data.update(rounded)
    return data
def merge_users_and_ratings(self, users: pd.DataFrame, ratings: pd.DataFrame):
    """Attach each user's ``age`` and ``location`` to their ratings.

    Both frames are re-indexed by ``user_id``. The ratings get ``age`` and
    ``location`` columns initialised to NaN, which are then filled from
    ``users`` through ``DataFrame.update`` (any other shared column is
    updated as well). The callers' frames are not modified.
    """
    users_by_id = users.set_index('user_id')
    merged = ratings.set_index('user_id').assign(age=np.nan, location=np.nan)
    merged.update(users_by_id)
    return merged
def fillna_all_by_str(data: pd.DataFrame, value="unk"):
    """Fill the missing values of every object-dtype column with ``value``.

    ``data`` is modified in place and nothing is returned. The affected
    columns and the fill value are printed first; the message is built by
    string concatenation, so ``value`` has to be a ``str``.
    """
    object_cols = [name for name in data.columns if data[name].dtype == object]
    print("fill cols:" + str(object_cols) + "\nvalues:" + value)
    filled = data[object_cols].fillna(value)
    data.update(filled)  # write the filled columns back
def get_sbu(
    df: pd.DataFrame,
    project: str,
    start: Union[None, str, int] = None,
    end: Union[None, str, int] = None,
) -> None:
    """Acquire the SBU usage for each account in the :attr:`pandas.DataFrame.index`.

    The start and end of the reported interval can, optionally, be altered with
    **start** and **end**.
    Performs an inplace update of **df**, adding new columns to hold the SBU usage
    per month under the ``"Month"`` super-column.
    In addition, a single row and column is added (``"sum"``) with SBU usage summed
    over the entire interval and over all users, respectively. Finally, the
    ``ACTIVE`` column flags every row whose summed usage exceeds 1.0.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        A Pandas DataFrame with usernames and information,
        constructed by :func:`yaml_to_pandas`.
        :attr:`pandas.DataFrame.columns` and :attr:`pandas.DataFrame.index` should
        be instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively.
        User accounts are expected to be stored in :attr:`pandas.DataFrame.index`.
        SBU usage (including the sum) is stored in the ``"Month"`` super-column.

    project : :class:`str`
        The project code of the project of interest; passed on to
        :func:`parse_accuse`.

    start : :class:`int` or :class:`str`, optional
        Optional: The starting year of the interval.
        Defaults to the current year if ``None``.

    end : :class:`str` or :class:`int`, optional
        Optional: The final year of the interval.
        Defaults to current year + 1 if ``None``.

    """
    # Create one empty column per month of the interval
    first_year, last_year = get_date_range(start, end)
    for stamp in _get_datetimeindex(first_year, last_year):
        df['Month', str(stamp)[:7]] = np.nan

    # Fill in the usage reported for the project
    usage = parse_accuse(project, first_year, last_year)
    df.update(usage)

    # Per-user totals (new column) and per-month totals (new row)
    month_total = ('Month', 'sum')
    df[month_total] = df['Month'].sum(axis=1)
    df.loc['sum'] = np.nan
    df.loc['sum', 'Month'] = df['Month'].sum(axis=0).values
    df.at['sum', PROJECT] = 'sum'
    df.at['sum', SBU_REQUESTED] = _get_total_sbu_requested(df)

    # Mark all active users
    df[ACTIVE] = df[month_total] > 1.0
def test_update_modify_view(self):
    """``update`` must also be visible through a view of the frame (GH#47188)."""
    source = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
    target = DataFrame({"A": ["a", "x"], "B": ["100", "200"]})
    result_view = target[:]

    target.update(source)

    expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]})
    tm.assert_frame_equal(target, expected)
    tm.assert_frame_equal(result_view, expected)
def _project(self, df: pd.DataFrame, iteration: int) -> pd.DataFrame:
    """Project the parameter space and store the result in ``df``.

    The projector configured for ``iteration`` is called on the parameter
    columns (as a NumPy array). Column ``i`` of its output is written to the
    ``i``-th header of ``self.projection_headers`` through
    ``DataFrame.update``. ``df`` is modified in place and also returned.

    NOTE(review): ``DataFrame.update`` only touches existing columns and
    aligns on the index, so this presumably expects the projection columns to
    exist already and ``df`` to use a default integer index -- confirm.
    """
    self._log("Projecting parameter space down to 2D...")
    projector = self.configuration.local_configurations[iteration].projector
    projected = projector(df[self.parameter_headers].to_numpy())

    column_values = {}
    for position, header in enumerate(self.projection_headers):
        column_values[header] = projected[:, position]

    df.update(column_values)
    return df
def test_update_raise(self):
    """``update(raise_conflict=True)`` must reject overlapping non-NA data."""
    base_rows = [[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3], [1.5, nan, 3]]
    df = DataFrame(base_rows)
    other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2])

    with assertRaisesRegexp(ValueError, "Data overlaps"):
        df.update(other, raise_conflict=True)
def sanitize_exchange(self, beta, exchange):
    """Return the benchmark of ``exchange`` shaped like ``beta`` with gaps filled.

    A frame with the index and columns of ``beta`` is filled from
    ``self._BENCHMARKS[exchange]``, missing values are forward- then
    backward-filled, and zeros are replaced by forward fill along the other
    axis. Rows that still contain missing values are dropped. A copy is
    returned.

    NOTE(review): ``fillna(method=...)`` and ``replace(..., method=...)`` are
    deprecated in recent pandas releases -- check the pinned pandas version
    before upgrading.
    """
    # Empty frame re-shaped like beta: every cell starts out missing.
    benchmark = DataFrame().reindex_like(beta)
    # Copy in the stored benchmark values, aligned on index/column labels.
    benchmark.update(self._BENCHMARKS[exchange])
    # Fill gaps forward first, then backward for leading gaps.
    benchmark.fillna(method='ffill', inplace=True)
    benchmark.fillna(method='bfill', inplace=True)
    benchmark.dropna(inplace=True)
    # Transpose so the zero replacement below runs across the original
    # columns, then transpose back.
    benchmark = benchmark.transpose().copy()
    benchmark.replace(to_replace=0, method='ffill', inplace=True)
    benchmark = benchmark.transpose().copy()
    benchmark.dropna(inplace=True)
    return benchmark.copy()
def correct_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
    """Forward-fill the state columns of ``df`` when interpolation is enabled.

    State columns are those whose name matches ``self.statecol_pattern``
    (``str.contains`` semantics). When ``self.interpolate`` is true their
    gaps are filled with the last preceding value and written back into
    ``df`` in place; otherwise ``df`` is returned untouched.

    The previous implementation called ``fillna`` on the *Index* of matching
    column names instead of on the column data, which raised a ``TypeError``
    as soon as ``self.interpolate`` was true.

    :param df: frame with more than two rows.
    :return: the same ``df`` object.
    :raises AssertionError: if ``df`` is ``None`` or has at most two rows.
    """
    assert df is not None
    assert len(df) > 2

    state_columns = df.columns[df.columns.str.contains(self.statecol_pattern)]

    if self.interpolate:
        # update() only copies non-NA values, i.e. exactly the filled gaps.
        df.update(df[state_columns].ffill())

    return df
def test_update_dtypes(self):
    """Updating the float columns must leave the bool columns alone (gh 3016)."""
    column_names = ['A', 'B', 'bool1', 'bool2']
    df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                   columns=column_names)
    other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])

    df.update(other)

    expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
                         columns=column_names)
    assert_frame_equal(df, expected)
def add_attribute_detail(view_part: pd.DataFrame):
    """Fill ``SIZE2``/``SIZE3`` from the comma-separated ``ATTR_PRD_WHL_VAL``.

    The attribute string is split on commas. When the split does not yield
    exactly four parts, everything from the fourth part on is joined with
    spaces into the fourth part. Parts three and four are then written into
    the ``SIZE2`` and ``SIZE3`` columns of ``view_part`` through
    ``DataFrame.update`` (in place); the same frame is returned.
    """
    def _join_present(cells):
        # Space-join the non-null cells of one row.
        return ' '.join(cells[~pd.isnull(cells)].values)

    parts = view_part.ATTR_PRD_WHL_VAL.str.split(',', expand=True)
    if parts.shape[1] != 4:
        parts.loc[:, 3] = parts.iloc[:, 3:].apply(_join_present, axis=1)
        parts = parts.loc[:, :3]
    assert parts.shape[1] == 4, 'attribute shape over 4'

    sizes = parts.loc[:, [2, 3]].rename(columns={2: 'SIZE2', 3: 'SIZE3'})
    assert sizes.shape[1] == 2

    view_part.update(sizes)
    return view_part
def fix_dates(dataframe:pd.DataFrame, dates_column:str):
    """Fix wrong dates in the PatentsView database

    Some (grant and application) dates on PatentsView are wrongly reported
    and cannot be converted into proper dates. For those patents the dates
    are re-fetched through the PatentsView API (an HTTP request is sent only
    when at least one date cannot be parsed). Afterwards two heuristics are
    applied to every row: a date ending in "00" gets "01" as its last two
    characters, and a date whose first two characters are neither "19" nor
    "20" gets "19" instead.

    The frame is modified in place (sorted by ``patent_id`` and the date,
    index reset) and also returned.
    """
    # Flag the dates that cannot be coerced into proper dates
    dataframe['date_'] = pd.to_datetime(dataframe[dates_column], errors='coerce')
    dataframe.sort_values(by=['patent_id', 'date_'], inplace=True)
    dataframe.set_index('patent_id', inplace=True)

    unparsed = dataframe.date_.isna()
    patents_to_fix = ','.join(
        [f'{{"patent_number":"{patent_id}"}}' for patent_id in dataframe[unparsed].index]
    )
    patents_to_fix_n = int(unparsed.sum())

    if patents_to_fix_n > 0:
        # Ask the PatentsView API for the dates of the flagged patents
        query = (
            'https://api.patentsview.org/patents/query?q={"_or":['
            + patents_to_fix
            + ']}&f=["patent_number","patent_date"]&o={"per_page":'
            + str(patents_to_fix_n)
            + '}'
        )
        response = requests.get(query)
        df_fix = pd.DataFrame(response.json()['patents'], dtype=str)
        df_fix.rename(
            columns={'patent_number': 'patent_id', 'patent_date': dates_column},
            inplace=True)
        df_fix.sort_values(by=['patent_id', dates_column], inplace=True)
        df_fix.set_index('patent_id', inplace=True)
        dataframe.update(df_fix)

    dataframe.drop(columns='date_', inplace=True)
    dataframe.reset_index(inplace=True)
    dataframe.sort_values(by=['patent_id', dates_column], inplace=True)

    # At this point, all the mistakes should have been fixed.
    # Anyhow, dates that are possibly still wrong are patched with the best
    # guesses possible given the information provided.

    # Fix any date that has "00" as day, putting "01" in place
    ends_in_00 = dataframe[dates_column].str.endswith('00')
    if len(ends_in_00) > 0:
        patched = dataframe.loc[ends_in_00, dates_column].str[:-2] + '01'
        dataframe.loc[ends_in_00, dates_column] = patched

    # Fix any date whose year doesn't start with "19" or "20",
    # putting "19" in place
    plausible_century = dataframe[dates_column].str[:2].isin(['19', '20'])
    recentred = '19' + dataframe.loc[~plausible_century, dates_column].str[2:]
    dataframe.loc[~plausible_century, dates_column] = recentred

    return dataframe
def test_update_nooverwrite(self):
    """With ``overwrite=False`` only cells that are NaN may be filled."""
    untouched_row = [1.5, nan, 3.]
    df = DataFrame([untouched_row, [1.5, nan, 3.], [1.5, nan, 3], [1.5, nan, 3]])
    other = DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]], index=[1, 3])

    df.update(other, overwrite=False)

    expected = DataFrame([[1.5, nan, 3], [1.5, 2, 3], [1.5, nan, 3], [1.5, nan, 3.]])
    assert_frame_equal(df, expected)
def test_update(self):
    """``update`` overwrites with the non-NA values of the other frame."""
    base_rows = [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
    df = DataFrame(base_rows)
    other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])

    df.update(other)

    expected_rows = [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
    tm.assert_frame_equal(df, DataFrame(expected_rows))
def test_update_filtered(self):
    """Only cells accepted by ``filter_func`` may be replaced."""
    base_rows = [[1.5, nan, 3.], [1.5, nan, 3.], [1.5, nan, 3], [1.5, nan, 3]]
    df = DataFrame(base_rows)
    other = DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]], index=[1, 3])

    df.update(other, filter_func=lambda x: x > 2)

    expected = DataFrame([[1.5, nan, 3], [1.5, nan, 3], [1.5, nan, 3], [1.5, nan, 7.]])
    assert_frame_equal(df, expected)
def interpolate(self, predictor, freq):
    """Resample ``Temp1`` on a regular grid and fill the gaps with predictions.

    A regular time grid with frequency ``freq`` is built between the first
    and last index of ``self.historical_data``, and the known values are
    copied onto it. Every grid point that is still NaN is then filled, in
    chronological order, with
    ``predictor.predict(<rows before that point>, interval=timedelta(minutes=5))``,
    so earlier predictions are visible to later ones.

    Changes from the previous version: ``np.nan`` replaces ``np.NaN``
    (removed in NumPy 2.0), and predictions are written straight into the
    frame rather than through a Series extracted from it, which does not
    propagate under pandas copy-on-write.

    :param predictor: object exposing ``predict(frame, interval=...)``.
    :param freq: pandas frequency string for the grid.
    :return: the filled ``Temp1`` Series.
    """
    history = self.historical_data
    idx = pd.period_range(min(history.index), max(history.index), freq=freq)
    result = DataFrame(index=idx.to_timestamp())
    result['Temp1'] = np.nan
    result.update(history)

    for position, stamp in enumerate(result.index):
        if np.isnan(result.at[stamp, 'Temp1']):
            prediction = predictor.predict(
                result.iloc[:position], interval=timedelta(minutes=5))
            result.at[stamp, 'Temp1'] = prediction

    return result['Temp1']
def retrieve_zij_counts_index(self):
    """Return the zij matrix, the frequencies and their shared index.

    The index is the union of the labels of ``self.prop_counts`` and of the
    frame returned by ``self.g_props_to_df()``. Both objects are expanded to
    that index, with missing entries set to 0, and the counts are normalised
    by their sum.

    :return: ``(zij_values, frequency_values, common_index)``, the first two
        as NumPy arrays and the last as a list.
    """
    def _zero_padded(blank, known):
        # Copy the known values onto the all-NaN container, then zero the rest.
        blank.update(known)
        return blank.fillna(0.)

    zij = self.g_props_to_df()
    common_index = list(set(self.prop_counts.index) | set(zij.index))

    zij_full = _zero_padded(
        DataFrame(nan, columns=common_index, index=common_index), zij)
    counts_full = _zero_padded(Series(nan, index=common_index), self.prop_counts)

    freqs = counts_full / counts_full.sum()
    return zij_full.values, freqs.values, common_index
def add_to_col(df_to_update: pd.DataFrame, to_add: pd.Series, col_to_update_name):
    """
    Add the values of ``to_add`` to the ``col_to_update_name`` column of
    ``df_to_update``.

    Both are aligned on their index, missing values count as 0, and the sums
    are written back in place through ``DataFrame.update``; labels that only
    exist in ``to_add`` are therefore ignored. Nothing is returned.
    """
    side_by_side = pd.concat(
        [df_to_update[[col_to_update_name]], to_add], axis=1, sort=True)
    totals = side_by_side.fillna(0).sum(axis=1)
    df_to_update.update(pd.DataFrame({col_to_update_name: totals}))
class InfoTable(DataFrameWidget):
    """Widget showing plate/well information held in ``self.table``."""

    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        """Move the four key columns to the front and refresh the widget."""
        leading = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        saved = [(position, label, self.table[label])
                 for position, label in enumerate(leading)]
        self.table = self.table.drop(labels=leading, axis=1)
        for position, label, values in saved:
            self.table.insert(position, label, values)
        self.setDataFrame(self.table)

    def append(self, appendage):
        """Append ``appendage`` to the table and refresh the widget."""
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        """Update the table from ``edits``, matching "Plate ID" with its "ID"."""
        self.table = self.table.set_index("Plate ID")
        self.table.update(edits.set_index("ID"))
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        """Update the table from ``plateData``, matching rows on ``key``."""
        keyed_plates = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(keyed_plates)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        """Join ``sampleData`` onto the table.

        ``sampleData[importKey]`` is copied into a ``tableKey`` column (this
        modifies the caller's frame) and used as the join key; clashing
        column names from ``sampleData`` get the suffix "_new".
        """
        sampleData[tableKey] = sampleData[importKey]
        keyed_samples = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(keyed_samples, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        """Return sample id and plant name columns indexed by plate and well."""
        wanted = ["Plate ID", "Well", "Sample ID", "Plant Alt Names"]
        return (
            self.table[wanted]
            .set_index(["Plate ID", "Well"])
            .rename(columns={"Plant Alt Names": "Plant AltName"})
        )
def test_update_dtypes(self):
    """Updating the float columns must leave the bool columns alone (gh 3016)."""
    column_names = ["A", "B", "bool1", "bool2"]
    df = DataFrame(
        [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
        columns=column_names,
    )
    other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])

    df.update(other)

    expected = DataFrame(
        [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
        columns=column_names,
    )
    tm.assert_frame_equal(df, expected)
def test_update_nan(self):
    """``overwrite=False`` with a datetime column alongside (#15593 #15617)."""
    dates = date_range('2000', periods=3)
    filler = DataFrame({'A': [None, 2, 3]})

    # test 1: nothing is missing in the target, so nothing may change
    complete = DataFrame({'A': [1.0, 2, 3], 'B': dates})
    expected = complete.copy()
    complete.update(filler, overwrite=False)
    tm.assert_frame_equal(complete, expected)

    # test 2: only the missing cell is filled
    gappy = DataFrame({'A': [1.0, None, 3], 'B': dates})
    expected = DataFrame({'A': [1.0, 2, 3], 'B': dates})
    gappy.update(filler, overwrite=False)
    tm.assert_frame_equal(gappy, expected)
def test_update_from_non_df(self):
    """``update`` accepts a dict of Series and a dict of lists."""
    # dict of Series
    series_data = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
    df = DataFrame(series_data)
    series_data['a'] = Series([5, 6, 7, 8])
    df.update(series_data)
    assert_frame_equal(df, DataFrame(series_data))

    # dict of lists
    list_data = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
    df = DataFrame(list_data)
    list_data['a'] = [5, 6, 7, 8]
    df.update(list_data)
    assert_frame_equal(df, DataFrame(list_data))
def test_update_datetime_tz(self):
    """Updating a tz-aware frame with itself must be a no-op (GH 25807)."""
    stamp = pd.Timestamp('2019', tz='UTC')
    result = DataFrame([stamp])

    result.update(result)

    assert_frame_equal(result, DataFrame([pd.Timestamp('2019', tz='UTC')]))
def test_update_deprecation(self, raise_conflict):
    """Passing ``raise_conflict`` must emit a FutureWarning."""
    target = DataFrame([[1.5, 1, 3.]])
    empty = DataFrame()

    with tm.assert_produces_warning(FutureWarning):
        target.update(empty, raise_conflict=raise_conflict)
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
    """Invalid keyword values must raise the expected exception and message."""
    frame = DataFrame([[1.5, 1, 3.]])

    with pytest.raises(exception, match=msg):
        frame.update(frame, **bad_kwarg)
def create_fip(temporary_store = None, year = None):
    """Build the tables of dependants ("pac") declared on tax forms.

    Reads the ``declar`` and ``anaisenf`` variables of the ``foyer`` table of
    the ``erfs_<year>`` survey, decodes the dependants encoded in
    ``anaisenf`` and drops redundant F/G and H/I entries. The result is then
    matched against ``temporary_store['indivim_<year>']``.

    Two tables are written to ``temporary_store``:

    * ``pacIndiv_<year>``: dependants found in the individual table, with
      their ``noindiv`` and ``type_pac``;
    * ``fipDat_<year>``: dependants not found in the individual table,
      completed with default individual-level variables.

    Nothing is returned.

    NOTE(review): this code appears to target Python 2 and an old pandas
    release (``zip(...)`` assigned to columns, ``DataFrame.sort(columns=...)``,
    ``duplicated(take_last=...)``, ``/`` expected to yield an integer) --
    confirm the supported runtime before modifying it.
    """
    assert temporary_store is not None
    assert year is not None
    # fip: personal tax file ("fichier d'imposition des personnes")
    """ Creates a 'fipDat' table containing all these 'fip individuals' """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each dependant takes 5 characters (1 letter + 4-digit year), so the
    # longest string gives the maximum number of dependants per household.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Placeholder frame of the right shape; every column is overwritten below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    # One row per (household, dependant slot)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work only on F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those with distinct declar/naia pairs as well as the twins,
    # then drop the others (entries that are both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that were neither F/G nor H/I were never flagged: keep them.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI

    # # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching keys: (year of birth, first 29 characters of the declaration id)
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # fip = declared dependants that match no individual through either key
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    # NOTE(review): when both frames are empty pacInd is never bound and the
    # statements below will fail -- confirm whether that case can occur.
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u" 2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    # Keep a single row per individual
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    # Attach the declarant ("vous") information of the first declaration
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")

    # Same for the second declaration
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")

    # NOTE(review): the two results below are discarded.
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    # Default individual-level variables for the added dependants
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec issue for fip children aged 16 to 20: we do not know whether they are students or employees
    # TODO issue with the birth months of FIP children: check whether these values can be recovered: Alexis: clearly not
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    # Decrement noi of duplicated (noi, ident) pairs until all are unique
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
# Tutorial-style walkthrough of reindexing, merging, updating and grouping.
# NOTE(review): `original`, `state_gdp` and `mean` are not defined in this
# excerpt -- presumably set up earlier in the script; confirm.

# Re-shape `original` to the labels of another frame / reorder its columns
different = DataFrame([[1,1],[2,2],[3.0,3]],index=['c','d','e'], columns=['one','two'])
original.reindex_like(different)
original.reindex_axis(['two','one'], axis = 1)

# The four join types on the shared column 'one'
left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
right = DataFrame([[1,2],[3,4],[7,8]],columns=['one','three'])
left.merge(right,on='one') # Same as how='inner'
left.merge(right,on='one', how='left')
left.merge(right,on='one', how='right')
left.merge(right,on='one', how='outer')

# In-place update: only the non-NaN cells of `right` are copied, aligned on
# index and column labels
left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
left
right = DataFrame([[nan,12],[13,nan],[nan,8]],columns=['one','two'],index=[1,2,3])
right
left.update(right) # Updates values in left
left

# Group-by on the 'region' column
subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
subset.head()
grouped_data = subset.groupby(by='region')
grouped_data.groups # Lists group names and index labels for group membership
grouped_data.mean() # Same as a pivot table

# Applying a function along each axis
subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','gdp_growth_2011','gdp_growth_2012']]
subset.index = state_gdp['state_code'].values
subset.head()
subset.apply(mean) # Same as subset.mean()
subset.apply(mean, axis=1).head() # Same as subset.mean(axis=1)
subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
class ResultsTable(DataFrameWidget):
    '''Table shown in the Results tab of fyd2.

    The data is held in ``self.table``, a DataFrame whose column order is
    given by ``self.columns``. Most methods rebind ``self.table`` to an
    updated frame; call ``update`` to push it to the widget.
    '''

    def __init__(self,samples=None):
        '''Creates an empty table and passes it to the DataFrameWidget base.

        ``samples`` is accepted for callers that pass it but is not used here.
        '''
        self.initVars()
        super(ResultsTable,self).__init__(self.table)

    def initVars(self):
        '''Initialises the column list and an empty table with those columns.'''
        self.columns = ['Plate ID','Plate Name','Plate Kea','Well','Population',
                        'Crop','Experiment','LC Well','Result','Group',
                        'Exists','Grind','Concentration','Include']
        self.table = DataFrame(columns=self.columns)

    ########################################################################

    def update(self):
        '''Coerces the flag columns to bool, restores the column order given
        by ``self.columns`` and hands the table to ``setDataFrame``.'''
        for flag in ('Exists','Grind','Include'):
            self.table[flag] = self.table[flag].map(bool)
        self.table = self.table[self.columns]
        self.setDataFrame(self.table)

    def append(self,appendage):
        '''Appends the samples in appendage to the table (index is renumbered).'''
        # NOTE(review): DataFrame.append was removed in pandas 2.0; this class
        # relies on it throughout, so it is left as is here.
        self.table = self.table.append(appendage,ignore_index=True)

    def editPlates(self,edits):
        '''Finds plates by Plate ID and overwrites their data with ``edits``.

        ``edits`` must have an 'ID' column holding the Plate ID.
        Used in the 'Edit Data' menu item.'''
        self.table = self.table.set_index('Plate ID')
        edits = edits.set_index('ID')
        self.table.update(edits)
        self.table = self.table.reset_index()

    ########################################################################

    def importPlateData(self,plateData,key):
        '''Updates the results table with the data read from the Plates
        Records spreadsheet, matching rows on the column named ``key``, then
        applies the non-harvested and bad-grind well lists.'''
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()
        plateData = plateData.reset_index()
        self.importPlateDataNonSamples(plateData)
        self.importPlateDataBadGrinds(plateData)

    def _flagListedWells(self,plateData,listColumn,flagColumn):
        '''Sets ``flagColumn`` to False for every (Plate ID, Well) named in
        the space-separated well lists of ``plateData[listColumn]``.'''
        listed = plateData[['Plate ID',listColumn]]
        listed = listed.dropna(how='all',subset=[listColumn])
        # One row per plate, one column per listed well; stack() turns that
        # into one row per (plate, well).
        wells = DataFrame(listed[listColumn].str.split(' ').tolist(),
                          index=listed['Plate ID']).stack()
        wells = wells.reset_index().drop('level_1',axis=1)
        wells.columns = ['Plate ID','Well']
        wells[flagColumn] = False
        wells = wells.set_index(['Plate ID','Well'])
        self.table = self.table.set_index(['Plate ID','Well'])
        self.table.update(wells)
        self.table = self.table.reset_index()

    def importPlateDataNonSamples(self,plateData):
        '''Marks 'Exists' as False for the wells listed in the
        'Non-harvested plants' column of plateData, which has been read from
        the Plates Records spreadsheet.'''
        self._flagListedWells(plateData,'Non-harvested plants','Exists')

    def importPlateDataBadGrinds(self,plateData):
        '''Marks 'Grind' as False for the wells listed in the 'Bad Grinds'
        column of plateData, which has been read from the Plates Records
        spreadsheet.'''
        self._flagListedWells(plateData,'Bad Grinds','Grind')

    ########################################################################

    def setCrop(self,crop):
        '''Sets the Crop column to ``crop`` for all samples.'''
        self.table['Crop'] = crop

    def addLCFiles(self,fileDataLists):
        '''Adds lightcycler results for given fileDataLists.

        Each entry is a ``(name, plate, exp, robot, pos)`` tuple; ``name`` is
        the tab-separated export to read.'''
        lc = DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
        for fDL in fileDataLists:
            name,plate,exp,robot,pos = fDL
            lcFrame = read_table(name,sep='\t',header=1)
            lcFrame = lcFrame.drop(['Include','Color','Name','Status'],axis=1)
            lcFrame['Experiment'] = exp
            lcFrame['Plate'] = plate
            lcFrame['Well'] = lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
            lcFrame['Group'] = lcFrame['Group'].astype(str)
            # Positional rename: assumes exactly 'Pos' and 'Group' survive
            # the drop above, in that order -- TODO confirm against the
            # export format.
            lcFrame.columns = ['LC Well','Result','Experiment','Plate','Well']
            lc = lc.append(lcFrame)
        lc = lc.set_index(['Plate','Well'])
        self.table = self.table.set_index(['Plate ID','Well'])
        self.table.update(lc)
        self.table = self.table.reset_index()

    def addTaqFiles(self,fileDataLists):
        '''Adds Taqman results for given fileDataLists.

        Each entry is a ``(name, plate, exp, robot, pos)`` tuple; ``name`` is
        the tab-separated export to read.'''
        lc = DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
        for fDL in fileDataLists:
            name,plate,exp,robot,pos = fDL
            lcFrame = read_table(name,sep='\t',header=1)
            lcFrame = lcFrame.drop(['Include','Color','465-510','618-660','Score','Status'],axis=1)
            lcFrame['Experiment'] = exp
            lcFrame['Plate'] = plate
            lcFrame['Well'] = lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
            lcFrame['Call'] = lcFrame['Call'].astype(str)
            lcFrame = lcFrame[['Pos','Call','Experiment','Plate','Well']]
            lcFrame.columns = ['LC Well','Result','Experiment','Plate','Well']
            lc = lc.append(lcFrame)
        lc = lc.set_index(['Plate','Well'])
        self.table = self.table.set_index(['Plate ID','Well'])
        self.table.update(lc)
        self.table = self.table.reset_index()

    ########################################################################

    def negativiseUnknowns(self):
        '''Sets all Unknown results to Negative.
        Used for Brassica/Ryegrass.'''
        self.table['Result'] = self.table['Result'].map(
            lambda res: 'Negative' if res == 'Unknown' else res)

    def setNonExistsToNegative(self):
        '''Reads the Exists column, and if it is False, sets the
        corresponding item in the Group column to Negative.'''
        self.table['Group'] = self.table.apply(
            lambda x: x['Group'] if x['Exists'] else 'Negative',axis=1)

    def setNonExistsToNoSample(self):
        '''Reads the Exists column, and if it is False, sets the
        corresponding item in the Group column to No sample.'''
        self.table['Group'] = self.table.apply(
            lambda x: x['Group'] if x['Exists'] else 'No sample',axis=1)

    def includeAll(self):
        '''Sets the Include column to True for all samples.'''
        self.table['Include'] = True

    def excludeFailGrinds(self):
        '''Sets the Include column to False for failed grinds.'''
        self.table['Include'] = self.table.apply(
            lambda x: False if not(x['Grind']) else x['Include'],axis=1)

    def excludeNegativeFailGrinds(self):
        '''Sets the Include column to False for failed grinds whose Group is
        Negative.'''
        self.table['Include'] = self.table.apply(
            lambda x: False if not(x['Grind']) and x['Group']=='Negative' else x['Include'],axis=1)

    def excludeNonExists(self):
        '''Sets the Include column to False for samples that don't exist.'''
        self.table['Include'] = self.table.apply(
            lambda x: False if not(x['Exists']) else x['Include'],axis=1)

    def missingResults(self):
        '''Returns a pair of booleans: whether any Result is missing, and
        whether any Group is missing.'''
        return self.table['Result'].isnull().any(), self.table['Group'].isnull().any()

    ########################################################################

    def getPopulations(self):
        '''Returns the distinct values of the Population column.'''
        return unique(self.table.Population.ravel())

    def getGroups(self):
        '''Returns the distinct values of the Group column.'''
        return unique(self.table.Group.ravel())

    def getExperiments(self):
        '''Returns the distinct values of the Experiment column.'''
        return unique(self.table.Experiment.ravel())

    def getCherriesByPop(self,cherryData):
        '''Gets cherrypicking data for passed populations/results.

        ``cherryData`` yields ``(pop, groups, samples)`` triples: for each,
        at most ``samples`` wells of population ``pop`` whose Group is in
        ``groups`` are picked.'''
        cherries = DataFrame(columns=['Source plate','Dest plate',
                                      'Source Position','Source Well',
                                      'Destination Position','Destination Well',
                                      'Volume (ul)','Run'])
        for pop,groups,samples in cherryData:
            cherry = self.table[['Plate ID','Well','Population','Group']]
            cherry = cherry.loc[(cherry.Population==pop) & (cherry.Group.isin(groups))]
            cherry = cherry.reset_index().head(samples)[['Plate ID','Well']]
            cherry.columns = ['Source plate','Source Well']
            cherries = cherries.append(cherry)
        cherries = self.fillCherryData(cherries)
        return cherries

    def getCherriesNU(self):
        '''Gets cherrypicking data for Negatives and Unknowns.'''
        cherries = DataFrame(columns=['Source plate','Dest plate',
                                      'Source Position','Source Well',
                                      'Destination Position','Destination Well',
                                      'Volume (ul)','Run'])
        cherry = self.table[['Plate ID','Well','Group']]
        cherry = cherry.loc[cherry.Group.isin(['Negative','Unknown'])]
        cherry = cherry.reset_index()[['Plate ID','Well']]
        cherry.columns = ['Source plate','Source Well']
        cherries = cherries.append(cherry)
        cherries = self.fillCherryData(cherries)
        return cherries

    def fillCherryData(self,cherries):
        '''Fills out the cherrypicking datatables, with Source Position,
        Destination Position, Destination Well, Run, and Volume.

        ``cherries`` needs 'Source plate' and 'Source Well' filled in. The
        returned frame has the eight cherrypicking columns in a fixed order.
        As a side effect ``self.run``, ``self.sources`` and ``self.dests``
        are left holding the state of the last run.'''
        controls = 2
        plateNames = 'CP'
        sources = [4,5,7,8]
        dests = [10,11]
        # Column-major 96-well order (A1..H1, A2..H2, ...), skipping the
        # first `controls` wells.
        wellsList = [l+str(n) for n in range(1,13) for l in 'ABCDEFGH'][controls:]
        wells = len(wellsList)
        rows = len(cherries)
        cherries['Volume (ul)'] = 50
        # `//` keeps the integer division the original relied on, on both
        # Python 2 and Python 3.
        cherries['Destination Well'] = wellsList * (rows // wells) + wellsList[:rows % wells]
        cherries['Dest plate'] = [plateNames + str(i // wells + 1) for i in range(rows)]
        sourcePlates = unique(cherries['Source plate'].ravel())
        destPlates = unique(cherries['Dest plate'].ravel())
        # Deck positions are handed out round-robin to the distinct plates.
        sourceDict = dict(zip(sourcePlates,itertools.cycle(sources)))
        destDict = dict(zip(destPlates,itertools.cycle(dests)))
        cherries['Source Position'] = cherries['Source plate'].apply(lambda x: sourceDict[x])
        cherries['Destination Position'] = cherries['Dest plate'].apply(lambda x: destDict[x])
        self.run = 1
        self.sources = []
        self.dests = []
        Run = []
        # The original read row[5] / row[2] from itertuples(), which only
        # resolves to these two columns when the frame's columns are in the
        # alphabetical order produced by append(); reading them by name gives
        # the same values without depending on column order.
        for source,dest in zip(cherries['Source Position'],
                               cherries['Destination Position']):
            if not self.sources or source != self.sources[-1]:
                self.sources.append(source)
            if not self.dests or dest != self.dests[-1]:
                self.dests.append(dest)
            # A run holds at most 4 source and 2 destination positions.
            if len(self.sources) > 4 or len(self.dests) > 2:
                self.run = self.run + 1
                self.sources = [source]
                self.dests = [dest]
            Run.append('Run ' + format(self.run,'03d'))
        cherries['Run'] = Run
        cherries['Source Position'] = cherries['Source Position'].apply(lambda x: 'P'+str(x))
        cherries['Destination Position'] = cherries['Destination Position'].apply(lambda x: 'P'+str(x))
        cherries = cherries[['Source plate','Dest plate','Source Position','Source Well',
                             'Destination Position','Destination Well','Volume (ul)','Run']]
        return cherries

    ########################################################################

    def getKeaSexTestingData(self):
        '''Gets the data required by the Kea Sex testing process run.

        Group values are converted to Kea sex-marker codes (anything not in
        the mapping becomes 'U'), columns are renamed to the Kea names and
        the result is indexed by (Plate ID, Well).'''
        # NOTE(review): setNonExistsToNoSample writes 'No sample' (lower-case
        # s), which does not match the 'No Sample' key below; it still maps
        # to 'U' through the default.
        kea = {'Male': 'M',
               'Female': 'F',
               '1': '1',
               '2': '2',
               '3': '3',
               '4': '4',
               'Negative': 'U',
               'Unknown': 'U',
               'No Sample':'U',}
        data = ['Plate ID','Plate Kea','Well','Experiment','Group']
        # Explicit copy: the columns below are rewritten and must not be
        # treated as a view on self.table.
        table = self.table[data].copy()
        table['Group'] = table['Group'].apply(lambda x: kea.get(x,'U'))
        table = table.rename(columns={'Plate Kea': 'Plate',
                                      'Experiment': 'Slipstream Expt No',
                                      'Plant Alt Names': 'Plant AltName',
                                      'Group': 'Sex Marker Results'})
        table = table.set_index(['Plate ID','Well'])
        return table
def create_fip(year = 2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len))/5 print "il ya a au maximum %s 
pac par foyer" %nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns) fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1,nb_pac_max+1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)] fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration','naia','type_pac']) # TODO: check if useful fip.set_index(["declaration","pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG),'/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN fip.update(tyFG) print 'enfants F & G 
traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')] pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # 
We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1' , len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum() print 'nb de NaN', 
pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]] individec1 = individec1.rename(columns={'declar1':'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]] individec2.rename(columns={'declar2':'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip 
= concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf']<=15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi','ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100*fip['ident'] + fip['noidec'] fip['noindiv'] = 100*fip['ident'] + fip['noi'] fip['type_pac'] = 0 ; fip['key'] = 0 print fip.duplicated('noindiv').value_counts() save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip 
sauvegardé'