Example #1
 def _cluster(self, df: pd.DataFrame, iteration: int) -> pd.DataFrame:
     """Assign cluster ids to projected parameters."""
     self._log("Clustering projected parameter space...")
     clust = self.configuration.local_configurations[iteration].clusterer
     cluster_ids = clust(df[self.projection_headers].to_numpy())
     df.update({self.cluster_header: cluster_ids})
     return df
Example #2
    def test_update_raise_on_overlap(self):
        df = DataFrame([[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2])
        with pytest.raises(ValueError, match="Data overlaps"):
            df.update(other, errors='raise')
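
A minimal sketch of the behavior this test pins down, assuming pandas >= 0.24 (where update takes errors='raise'): the error fires only when a non-NaN value in the caller would be overwritten by a non-NaN value in other.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1.5, 1, 3.0], [1.5, np.nan, 3.0]])
other = pd.DataFrame([[2.0, np.nan]], index=[1], columns=[1, 2])

# NaNs never overwrite and never conflict, so this succeeds even with
# errors='raise': other only fills the NaN at row 1, column 1.
df.update(other, errors='raise')

# Overlapping non-NaN values (row 0, column 1 is set in both) do raise.
overlapping = pd.DataFrame([[7.0]], index=[0], columns=[1])
try:
    df.update(overlapping, errors='raise')
except ValueError as exc:
    print(exc)  # the "Data overlaps" message matched by the test above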
Example #3
    def build_similarity_matrix(self, fingerprint_matrices):
        if fingerprint_matrices is None:
            return None
        names = list(fingerprint_matrices.keys())
        similarity_matrix = DataFrame(index=names, columns=names)

        if self.sim_pickle_path is not None:
            if os.path.isfile(self.sim_pickle_path):
                print("Found pickled similarity matrix at '" + self.sim_pickle_path + "', importing...")
                with open(self.sim_pickle_path, 'rb') as sim_pickle:
                    similarity_matrix.update(pickle.load(sim_pickle))
            else:
                print("Warning: was asked to look for similarity matrix at '" + self.sim_pickle_path + "'")
                print("Couldn't find one -- new pickle file will be created.")

        for name1, fp1 in fingerprint_matrices.items():
            for name2, fp2 in fingerprint_matrices.items():
                # print("Comparing: " + name1 + " and " + name2)
                if name1 == name2:
                    similarity_matrix.loc[name1, name2] = -1
                elif np.isnan(similarity_matrix.loc[name1, name2]):
                    comparison_result = self.compare(fp1, fp2)
                    similarity_measure = self.similarity_measure(comparison_result)
                    similarity_matrix.loc[name1, name2] = similarity_measure
                    similarity_matrix.loc[name2, name1] = similarity_measure

        return similarity_matrix
Example #4
def update_listened_to_durations(playtracks_df: pd.DataFrame, current_timestamp_ms: int) -> pd.DataFrame:
    """
    Given a populated pandas DataFrame containing extracted data from recently played tracks according to
    ListenerCommon#SCHEMA, calculate the time in milliseconds spent listening to each track.
    Returns an updated pandas DataFrame with ListenerCommon#LISTENED_TIME filled in with the results of the calculation.
    The returned dataframe is sorted in ascending order of the timestamp of when the track was played.
    :param playtracks_df: The input pandas DataFrame
    :param current_timestamp_ms: The timestamp of when the query was run
    :return: Output pandas DataFrame with calculated ListenerCommon#LISTENED_TIME, sorted by when the track was played in
    ascending order
    """
    playtracks_df = playtracks_df.sort_values(ListenerCommon.TIMESTAMP[0], ascending=False)

    listened_ms_list = []
    more_recent_timestamp_ms = current_timestamp_ms

    for index, row in playtracks_df.iterrows():
        track_duration_ms = TRACK_DURATION_MS[row[ListenerCommon.TRACK_ID[0]]]
        played_at_timestamp_ms = int(row[ListenerCommon.TIMESTAMP[0]]*1000)

        # If the gap between when this track was played and when the next track was played is longer than the
        # track's duration, assume the entire track was listened to and use the track duration as the value.
        # Otherwise, use the gap between when this track was played and when the next track was played.
        if (more_recent_timestamp_ms - played_at_timestamp_ms) > track_duration_ms:
            listened_ms_list.append(ListenerCommon.LISTENED_TIME[1](track_duration_ms))
        else:
            listened_ms_list.append(ListenerCommon.LISTENED_TIME[1](more_recent_timestamp_ms - played_at_timestamp_ms))

        more_recent_timestamp_ms = played_at_timestamp_ms

    listened_ms_series = pd.Series(listened_ms_list, name=ListenerCommon.LISTENED_TIME[0], index=playtracks_df.index)
    playtracks_df.update(listened_ms_series)

    return playtracks_df.sort_values(ListenerCommon.TIMESTAMP[0], ascending=True)
Example #5
 def test_update_with_subset_str_dtype(self, string_dtype):
     # GH4094
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=string_dtype)
     update = df.copy()[:-1]
     expected = df.copy()
     df.update(update)
     assert df.a.dtype == expected.a.dtype
Example #6
 def test_update_with_subset_bool_dtype(self):
     # GH4094
     df = DataFrame({"a": [True, False]}, dtype=bool)
     update = DataFrame({"a": [False]}, dtype=bool)
     expected = df.copy()
     df.update(update)
     assert df.a.dtype == expected.a.dtype
Example #7
 def apply(meta: dict, obj: Tuple[DataFrame], data: DataFrame) -> DataFrame:
     obj = obj[0].copy()
     data.update(obj)
     if meta['dtypes'] is not None:
         data = data.astype(meta['dtypes'])
     
     return data
Example #8
 def test_update_with_subset_and_same_not_nullable_dtype(
         self, any_real_dtype):
     # GH4094
     df = DataFrame({"a": Series([1, 2, 3], dtype=any_real_dtype)})
     update = df.copy()[:-1]
     df.update(update)
     assert df.a.dtype == any_real_dtype
Example #9
def get_google_fit_steps(fname: Union[Path, str],
                         data: pd.DataFrame) -> pd.DataFrame:
    """
    This function updates a dataframe with the JSON data
    gathered from the GoogleFit API.
    It updates the values in the `googlefitsteps` column.

    Params:
        fname: path to data folder for participant X
        data:  pandas data frame to store data
    """

    directory = Path(fname)

    # find the JSON file with GoogleFit data; fail clearly if none exists
    path_to_json = None
    for child in directory.iterdir():
        if child.suffix == ".json" and "GoogleFit" in child.stem:
            path_to_json = child
            break

    if path_to_json is None:
        raise FileNotFoundError(f"No GoogleFit JSON file found in {directory}")

    # initiate interface to file
    json_interface = GoogleFitDataJSON(path_to_json)

    # return the extracted dataframe
    new_data = json_interface.df

    # update data
    data.update(new_data)

    return data
Example #10
def gather_frame_fields(df: pd.DataFrame,
                        other_df: pd.DataFrame,
                        index_label: str = None,
                        fields: list = None,
                        copy_frames: bool = False,
                        append_missing: bool = True,
                        **kwargs):

    if copy_frames:
        df = df.copy()
        other_df = other_df.copy()

    if index_label is not None:
        for frame in [df, other_df]:
            if frame.index.name != index_label and index_label in frame.columns:
                frame.set_index(index_label, drop=False, inplace=True)

    if fields:
        other_df_orig = other_df.copy()
        other_df = other_df.loc[:, fields]
    else:
        other_df_orig = other_df

    df.update(other_df, **kwargs)

    if append_missing is True:
        df_add = other_df_orig.loc[~other_df_orig.index.isin(df.index), :]
        df = pd.concat([df, df_add])

    return df
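
A usage sketch with invented frames, showing the three behaviours in one call: align on index_label, update overlapping rows, and append rows missing from df.

import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'price': [10.0, 20.0]})
other = pd.DataFrame({'id': [2, 3], 'price': [25.0, 30.0]})

# id 2 gets its price updated to 25.0; id 3 is appended because
# append_missing defaults to True.
merged = gather_frame_fields(df, other, index_label='id',
                             fields=['id', 'price'], copy_frames=True)
print(merged)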
Example #11
    def _interpolate_column(self, dataframe: DataFrame, column_id: str,
                            time: float) -> DataFrame:
        person_df = pd.DataFrame(dataframe[column_id], index=dataframe.index)
        person_df = person_df.dropna()

        date_to_add, time_to_add = self._get_date_for_new_time(
            dataframe, column_id, time)

        # create new entry and add it
        new_df = pd.DataFrame([time_to_add],
                              columns=[column_id],
                              index=[date_to_add])
        person_df = person_df.append(new_df)

        # interpolate
        person_df = utils.interpolate_dates(person_df)

        # modify /!\ IN PLACE /!\ using non-NA values from another DataFrame
        dataframe.update(person_df)

        # append new values, if any
        rows_to_append = person_df.loc[person_df.index.difference(
            dataframe.index, sort=False)]
        dataframe = dataframe.append(rows_to_append, sort=False)

        return dataframe
Example #12
    def test_update_raise(self):
        df = DataFrame([[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2])
        with tm.assert_raises_regex(ValueError, "Data overlaps"):
            df.update(other, raise_conflict=True)
Example #13
def update_df(df: pd.DataFrame,
              new_df: pd.DataFrame,
              on: (str, list) = None,
              mode='update'):
    """
    Update the data in a DataFrame based on a given column.

    :param df: the DataFrame to be updated
    :param new_df: the new table
    :param on: which column(s) to match on; defaults to None, which uses the index
    :param mode: 'update' overwrites values at matching positions; 'insert' only fills positions that are empty
    :return:
    """
    v1 = len(df)
    if on is not None:
        on = ensure_list(on)
        new_df = new_df.drop_duplicates()
        if any(new_df[on].duplicated()):
            raise ValueError('new_df has duplicate key columns mapping to different values, please check')
        new_df = df[on].drop_duplicates().merge(new_df, how='inner', on=on)
        df = df.set_index(on, drop=False)
        new_df = new_df.set_index(on, drop=False)
    if mode == 'update':
        df.update(new_df)
    elif mode == 'insert':
        df = df.combine_first(new_df)
    else:
        raise ValueError(f'Invalid mode {mode}; valid options are update or insert')
    df = df.reset_index(drop=True)
    if on is not None:
        if v1 != len(df):
            raise ValueError('DataFrame structure changed after update, please check')
    return df
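
A toy run of the two modes, assuming the module's ensure_list helper is on the import path alongside update_df:

import numpy as np
import pandas as pd

df = pd.DataFrame({'code': ['a', 'b'], 'price': [1.0, np.nan]})
new = pd.DataFrame({'code': ['a', 'b'], 'price': [9.0, 2.0]})

# mode='update' overwrites matching positions: prices become 9.0 and 2.0.
print(update_df(df.copy(), new, on='code', mode='update'))

# mode='insert' only fills empty positions: 1.0 is kept, NaN becomes 2.0.
print(update_df(df.copy(), new, on='code', mode='insert'))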
Example #14
def _pre_process(data: pd.DataFrame) -> pd.DataFrame:
    ymca_columns = [
        column for column in data.columns
        if column.startswith(C.COL_YMCA_PREFIX)
    ]
    data.update(_round_ymca_columns(data, ymca_columns))

    return data
Example #15
 def merge_users_and_ratings(self, users: pd.DataFrame,
                             ratings: pd.DataFrame):
     users = users.set_index('user_id')
     ratings = ratings.set_index('user_id')
     ratings['age'] = np.nan
     ratings['location'] = np.nan
     ratings.update(users)
     return ratings
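
The pattern here: index both frames by the shared key, pre-create the target columns, then let update align on user_id; duplicate index labels on the ratings side broadcast each user's attributes onto all of that user's rows. A standalone sketch with toy frames:

import numpy as np
import pandas as pd

users = pd.DataFrame({'user_id': [1, 2], 'age': [31, 45],
                      'location': ['nyc', 'sf']}).set_index('user_id')
ratings = pd.DataFrame({'user_id': [1, 1, 2],
                        'rating': [5, 3, 4]}).set_index('user_id')

# Without the pre-created columns, update() would have nothing to write into.
ratings['age'] = np.nan
ratings['location'] = np.nan
ratings.update(users)
print(ratings)  # both rows for user 1 carry age 31.0 and location 'nyc'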
Example #16
def fillna_all_by_str(data: pd.DataFrame, value="unk"):
    strs = []
    for i in data.columns:
        #print(data[i].dtype)
        if data[i].dtype == object:
            strs.append(i)
    print("fill cols:" + str(strs) + "\nvalues:" + value)
    data.update(data[strs].fillna(value))  # write back
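
A quick sketch of the effect on a toy frame (the function mutates data in place and only touches object-dtype columns):

import numpy as np
import pandas as pd

data = pd.DataFrame({'city': ['pa', np.nan], 'n': [1.0, np.nan]})
fillna_all_by_str(data)
print(data)  # the NaN in 'city' becomes "unk"; the numeric 'n' keeps its NaN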
Example #17
def get_sbu(
    df: pd.DataFrame,
    project: str,
    start: Union[None, str, int] = None,
    end: Union[None, str, int] = None,
) -> None:
    """Acquire the SBU usage for each account in the :attr:`pandas.DataFrame.index`.

    The start and end of the reported interval can, optionally, be altered with **start**
    and **end**.
    Performs an inplace update of **df**, adding new columns to hold the SBU usage per month under
    the ``"Month"`` super-column.
    In addition, a single row and column is added (``"sum"``) with SBU usage summed over
    the entire interval and over all users, respectively.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        A Pandas DataFrame with usernames and information, constructed by :func:`yaml_to_pandas`.
        :attr:`pandas.DataFrame.columns` and :attr:`pandas.DataFrame.index` should
        be instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively.
        User accounts are expected to be stored in :attr:`pandas.DataFrame.index`.
        SBU usage (including the sum) is stored in the ``"Month"`` super-column.

    start : :class:`int` or :class:`str`, optional
        Optional: The starting year of the interval.
        Defaults to the current year if ``None``.

    end : :class:`str` or :class:`int`, optional
        Optional: The final year of the interval.
        Defaults to current year + 1 if ``None``.

    project : :class:`str`, optional
        Optional: The project code of the project of interest.
        If not ``None``, only SBUs expended under this project are considered.

    """
    # Construct new columns in **df**
    sy, ey = get_date_range(start, end)
    date_range = _get_datetimeindex(sy, ey)
    for i in date_range:
        df[('Month', str(i)[:7])] = np.nan

    df_tmp = parse_accuse(project, sy, ey)
    df.update(df_tmp)

    # Calculate SBU sums
    SUM = ('Month', 'sum')
    df[SUM] = df['Month'].sum(axis=1)
    df.loc['sum'] = np.nan
    df.loc['sum', 'Month'] = df['Month'].sum(axis=0).values
    df.at['sum', PROJECT] = 'sum'
    df.at['sum', SBU_REQUESTED] = _get_total_sbu_requested(df)

    # Mark all active users
    df[ACTIVE] = False
    df.loc[df[SUM] > 1.0, ACTIVE] = True
Example #18
 def test_update_modify_view(self):
     # GH#47188
     df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
     df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]})
     result_view = df2[:]
     df2.update(df)
     expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]})
     tm.assert_frame_equal(df2, expected)
     tm.assert_frame_equal(result_view, expected)
Example #19
 def _project(self, df: pd.DataFrame, iteration: int) -> pd.DataFrame:
     """Project parameter space down to a 2D space."""
     self._log("Projecting parameter space down to 2D...")
     proj = self.configuration.local_configurations[iteration].projector
     proj_arr = proj(df[self.parameter_headers].to_numpy())
     df.update({
         pn: proj_arr[:, i]
         for i, pn in enumerate(self.projection_headers)
     })
     return df
Example #20
    def test_update_raise(self):
        df = DataFrame([[1.5, 1, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[2., nan],
                           [nan, 7]], index=[1, 3], columns=[1, 2])
        with assertRaisesRegexp(ValueError, "Data overlaps"):
            df.update(other, raise_conflict=True)
Example #21
    def test_update_raise_on_overlap(self):
        df = DataFrame([[1.5, 1, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[2., nan],
                           [nan, 7]], index=[1, 3], columns=[1, 2])
        with pytest.raises(ValueError, match="Data overlaps"):
            df.update(other, errors='raise')
Example #22
 def sanitize_exchange(self, beta, exchange):
     benchmark = DataFrame().reindex_like(beta)
     benchmark.update(self._BENCHMARKS[exchange])
     benchmark.fillna(method='ffill', inplace=True)
     benchmark.fillna(method='bfill', inplace=True)
     benchmark.dropna(inplace=True)
     benchmark = benchmark.transpose().copy()
     benchmark.replace(to_replace=0, method='ffill', inplace=True)
     benchmark = benchmark.transpose().copy()
     benchmark.dropna(inplace=True)
     return benchmark.copy()
Example #23
    def correct_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        assert df is not None
        assert len(df) > 2

        state_cols = df.columns[df.columns.str.contains(
            self.statecol_pattern)]
        df_state_end = df[state_cols]
        if self.interpolate:
            df_state_end = df_state_end.fillna(method='pad')
        df.update(df_state_end)

        return df
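
The underlying pattern, shown standalone with the same fillna(method='pad') call as above (deprecated in newer pandas in favor of ffill): forward-fill only the matching columns, then write the filled values back with update so the other columns are untouched.

import numpy as np
import pandas as pd

df = pd.DataFrame({'state_end': [1.0, np.nan, np.nan],
                   'other': [np.nan, 5.0, np.nan]})
filled = df[['state_end']].fillna(method='pad')
df.update(filled)
print(df)  # 'state_end' is forward-filled; 'other' keeps its NaNs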
Example #24
    def test_update_dtypes(self):
        # gh 3016
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        df.update(other)

        expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
                             columns=['A', 'B', 'bool1', 'bool2'])
        assert_frame_equal(df, expected)
Example #25
def add_attribute_detail(view_part: pd.DataFrame):
    attributes = view_part.ATTR_PRD_WHL_VAL.str.split(',', expand=True)
    if attributes.shape[1] != 4:
        paste_leftover = attributes.iloc[:, 3:].apply(lambda x: ' '.join(x[~pd.isnull(x)].values), axis=1)
        attributes.loc[:, 3] = paste_leftover
        attributes = attributes.loc[:, :3]
        assert attributes.shape[1] == 4, 'attribute shape over 4'
    attributes = attributes.loc[:, [2, 3]].rename(columns={2: 'SIZE2', 3: 'SIZE3'})
    assert attributes.shape[1] == 2
    view_part.update(attributes)
    return view_part
Example #26
def fix_dates(dataframe:pd.DataFrame, dates_column:str):
    """Fix wrong dates in the PatentsView database
    Some (grant and application) dates on PatentsView are wrongly reported
      and cannot be converted into proper dates. However, if you look on the
      PatentsView website, most (all?) are correct. Therefore, this function
      uses the PatentsView API to retrieve the correct dates
      (or, as a second best, it tries to fix them with a simple heuristic).
    """
    # Use the PatentsView API to fix those application dates 
    #  that cannot be coerced into proper dates
    dataframe['date_'] = pd.to_datetime(
        dataframe[dates_column], errors='coerce')
    dataframe.sort_values(by=['patent_id','date_'], inplace=True)
    dataframe.set_index('patent_id', inplace=True)
    patents_to_fix = ','.join([f'{{"patent_number":"{patent_id}"}}' \
        for patent_id in dataframe[dataframe.date_.isna()].index])
    patents_to_fix_n = sum(dataframe.date_.isna())
    if patents_to_fix_n>0:
        query = ''.join([
            'https://api.patentsview.org/patents/query?q={"_or":[',
            patents_to_fix,
            ']}&f=["patent_number","patent_date"]&o={"per_page":',
            str(patents_to_fix_n), '}'])
        response = requests.get(query)
        df_fix = pd.DataFrame(response.json()['patents'], dtype=str)
        df_fix.rename(columns={
            'patent_number':'patent_id', 
            'patent_date':dates_column}, inplace=True)
        df_fix.sort_values(by=['patent_id',dates_column], inplace=True)
        df_fix.set_index('patent_id', inplace=True)
        dataframe.update(df_fix)
    dataframe.drop(columns='date_', inplace=True)
    dataframe.reset_index(inplace=True)
    dataframe.sort_values(by=['patent_id',dates_column], inplace=True)
        
    # At this point, all the mistakes should have been fixed
    #  Anyhow, the script will fix dates that are possibly still wrong 
    #  applying some heuristic with the best guesses we can do, 
    #  given the information provided
    # Fix any date that has "00" as day, putting "01" inplace
    subset = dataframe[dates_column].str.endswith('00')
    if subset.any():
        dataframe.loc[
            subset,dates_column] = dataframe.loc[
                subset,dates_column].str[:-2] + '01'
    # Fix any date whose year doesn't start with "19" or "20",
    #  putting "19" inplace
    subset = dataframe[dates_column].str[:2].isin(['19','20'])
    dataframe.loc[
        ~subset,dates_column] = '19' + dataframe.loc[
            ~subset,dates_column].str[2:]
    
    return dataframe
Example #27
    def test_update_dtypes(self):

        # gh 3016
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        df.update(other)

        expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
                             columns=['A', 'B', 'bool1', 'bool2'])
        assert_frame_equal(df, expected)
Example #28
    def test_update_nooverwrite(self):
        df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]],
                          index=[1, 3])

        df.update(other, overwrite=False)

        expected = DataFrame([[1.5, nan, 3], [1.5, 2, 3], [1.5, nan, 3],
                              [1.5, nan, 3.]])
        assert_frame_equal(df, expected)
Example #29
    def test_update(self):
        df = DataFrame([[1.5, np.nan, 3.0], [1.5, np.nan, 3.0],
                        [1.5, np.nan, 3], [1.5, np.nan, 3]])

        other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]],
                          index=[1, 3])

        df.update(other)

        expected = DataFrame([[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3],
                              [1.5, np.nan, 7.0]])
        tm.assert_frame_equal(df, expected)
Example #30
    def test_update_filtered(self):
        df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]],
                          index=[1, 3])

        df.update(other, filter_func=lambda x: x > 2)

        expected = DataFrame([[1.5, nan, 3], [1.5, nan, 3], [1.5, nan, 3],
                              [1.5, nan, 7.]])
        assert_frame_equal(df, expected)
Example #31
    def interpolate(self, predictor, freq):

        idx = pd.period_range(min(self.historical_data.index), max(self.historical_data.index), freq=freq)
        result = DataFrame(index=idx.to_timestamp())
        result['Temp1'] = np.nan
        result.update(self.historical_data)
        X = result['Temp1']
        for i in range(len(result)):
            if np.isnan(X[result.index[i]]):
                prediction = predictor.predict(result[:i], interval=timedelta(minutes=5))
                X[result.index[i]] = prediction
        return X
Example #32
    def retrieve_zij_counts_index(self):
        zij = self.g_props_to_df()
        common_index = list(set(self.prop_counts.index) | set(zij.index))

        df_tot = DataFrame(nan, columns=common_index, index=common_index)
        df_tot.update(zij)
        zij = df_tot.fillna(0.)

        vc_tot = Series(nan, index=common_index)
        vc_tot.update(self.prop_counts)
        vc_tot = vc_tot.fillna(0.)
        freqs = vc_tot / vc_tot.sum()
        return zij.values, freqs.values, common_index
Example #33
def add_to_col(df_to_update: pd.DataFrame, to_add: pd.Series,
               col_to_update_name):
    """
    Add the values of to_add to the col_to_update_name column of the df_to_update DataFrame.
    """
    df_to_update.update(
        pd.DataFrame({
            col_to_update_name:
            pd.concat([df_to_update[[col_to_update_name]], to_add],
                      axis=1,
                      sort=True).fillna(0).sum(axis=1)
        }))
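
A toy run, assuming the indices of the column and the series align (positions missing on either side are treated as 0 by the fillna(0)):

import pandas as pd

df = pd.DataFrame({'total': [10.0, 20.0]})
extra = pd.Series([1.0, 2.0])

add_to_col(df, extra, 'total')
print(df)  # total becomes [11.0, 22.0], updated in place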
Example #34
class InfoTable(DataFrameWidget):
    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
Example #35
    def test_update_nooverwrite(self):
        df = DataFrame([[1.5, nan, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan],
                           [np.nan, np.nan, 7]], index=[1, 3])

        df.update(other, overwrite=False)

        expected = DataFrame([[1.5, nan, 3],
                              [1.5, 2, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 3.]])
        assert_frame_equal(df, expected)
Example #36
    def test_update_dtypes(self):

        # gh 3016
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )

        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
        df.update(other)

        expected = DataFrame(
            [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )
        tm.assert_frame_equal(df, expected)
Example #37
    def test_update_filtered(self):
        df = DataFrame([[1.5, nan, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan],
                           [np.nan, np.nan, 7]], index=[1, 3])

        df.update(other, filter_func=lambda x: x > 2)

        expected = DataFrame([[1.5, nan, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 7.]])
        assert_frame_equal(df, expected)
Example #38
    def test_update_nan(self):
        # #15593 #15617
        # test 1
        df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = df1.copy()
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)

        # test 2
        df1 = DataFrame({'A': [1.0, None, 3],
                         'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = DataFrame({'A': [1.0, 2, 3],
                              'B': date_range('2000', periods=3)})
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)
Example #39
    def test_update_from_non_df(self):
        d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
        df = DataFrame(d)

        d['a'] = Series([5, 6, 7, 8])
        df.update(d)

        expected = DataFrame(d)

        assert_frame_equal(df, expected)

        d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
        df = DataFrame(d)

        d['a'] = [5, 6, 7, 8]
        df.update(d)

        expected = DataFrame(d)

        assert_frame_equal(df, expected)
Example #40
 def test_update_datetime_tz(self):
     # GH 25807
     result = DataFrame([pd.Timestamp('2019', tz='UTC')])
     result.update(result)
     expected = DataFrame([pd.Timestamp('2019', tz='UTC')])
     assert_frame_equal(result, expected)
Example #41
 def test_update_deprecation(self, raise_conflict):
     df = DataFrame([[1.5, 1, 3.]])
     other = DataFrame()
     with tm.assert_produces_warning(FutureWarning):
         df.update(other, raise_conflict=raise_conflict)
Example #42
 def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
     df = DataFrame([[1.5, 1, 3.]])
     with pytest.raises(exception, match=msg):
         df.update(df, **bad_kwarg)
Example #43
def create_fip(temporary_store = None, year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'.
    """
    # fip: fichier d'imposition des personnes (individuals' tax return file)
    assert temporary_store is not None
    assert year is not None
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those whose declar/naia pairs differ, plus the twins,
    #       then drop the others (both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec problem for fip children between 16 and 20: we don't know whether they are students or employed
    # TODO: problem with the months of FIP children: see whether these values can be recovered. Alexis: clearly not

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Example #44
different = DataFrame([[1,1],[2,2],[3.0,3]],index=['c','d','e'], columns=['one','two'])
original.reindex_like(different)
original.reindex_axis(['two','one'], axis = 1)

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
right = DataFrame([[1,2],[3,4],[7,8]],columns=['one','three'])
left.merge(right,on='one') # Same as how='inner'
left.merge(right,on='one', how='left')
left.merge(right,on='one', how='right')
left.merge(right,on='one', how='outer')

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
left
right = DataFrame([[nan,12],[13,nan],[nan,8]],columns=['one','two'],index=[1,2,3])
right
left.update(right) # Updates values in left
left
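# Expected effect: NaNs in right never overwrite, and right's index 3 (absent
# from left) is ignored, so left becomes one=[1, 3, 13], two=[2, 12, 6]
# (columns may be upcast to float).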

subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
subset.head()
grouped_data = subset.groupby(by='region')
grouped_data.groups # Lists group names and index labels for group membership
grouped_data.mean()  # Same as a pivot table

subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','gdp_growth_2011','gdp_growth_2012']]
subset.index = state_gdp['state_code'].values
subset.head()
subset.apply(mean) # Same as subset.mean()
subset.apply(mean, axis=1).head() # Same as subset.mean(axis=1)

subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
Example #45
class ResultsTable(DataFrameWidget):
	'''The Class implementing the table in the Results tab of fyd2.'''
	def __init__(self,samples=None):
		self.initVars()
		super(ResultsTable,self).__init__(self.table)
		
	def initVars(self):
		'''Initialises variables.'''
		self.columns					= ['Plate ID','Plate Name','Plate Kea','Well','Population',
											'Crop','Experiment','LC Well','Result','Group',
											'Exists','Grind','Concentration','Include']
		self.table						= DataFrame(columns=self.columns)
	
########################################################################
	def update(self):
		'''Resets the booleans to booleans and reorders the columns.'''
		self.table['Exists']			= self.table['Exists'].map(lambda e: bool(e))
		self.table['Grind']				= self.table['Grind'].map(lambda g: bool(g))
		self.table['Include']			= self.table['Include'].map(lambda g: bool(g))
		self.table						= self.table[self.columns]
		self.setDataFrame(self.table)
		
	def append(self,appendage):
		'''Append the samples in appendage to the table.'''
		self.table						= self.table.append(appendage,ignore_index=True)

	def editPlates(self,edits):
		'''Finds plates by Plate ID and edits data. Used in the 'Edit Data'
		menu item.'''
		self.table				= self.table.set_index('Plate ID')
		edits					= edits.set_index('ID')
		self.table.update(edits)
		self.table				= self.table.reset_index()

########################################################################		
	def importPlateData(self,plateData,key):
		'''Updates the results table with the data read from the Plates
		Records spreadsheet.'''
		plateData				= plateData.set_index(key)
		self.table				= self.table.set_index(key)
		self.table.update(plateData)
		self.table				= self.table.reset_index()
		plateData				= plateData.reset_index()
	
		self.importPlateDataNonSamples(plateData)
		self.importPlateDataBadGrinds(plateData)
		
	def importPlateDataNonSamples(self,plateData):
		'''Updates the non-existing sample column from plateData, which 
		has been read from the Plates Records spreadsheet.'''
		plateData				= plateData[['Plate ID','Non-harvested plants']]
		exists					= plateData.dropna(how='all',subset=['Non-harvested plants'])
		exists					= DataFrame(exists['Non-harvested plants'].str.split(' ').tolist(),index=exists['Plate ID']).stack()
		exists					= exists.reset_index().drop('level_1',1)
		exists.columns			= ['Plate ID','Well']
		exists['Exists']		= False
		exists					= exists.set_index(['Plate ID','Well'])
		self.table				= self.table.set_index(['Plate ID','Well'])
		self.table.update(exists)
		self.table				= self.table.reset_index()
		
	def importPlateDataBadGrinds(self,plateData):
		'''Updates the Bad Grinds column from plateData, which has been
		read from the Plates Records spreadsheet.'''
		plateData				= plateData[['Plate ID','Bad Grinds']]
		badGrinds				= plateData.dropna(how='all',subset=['Bad Grinds'])
		badGrinds				= DataFrame(badGrinds['Bad Grinds'].str.split(' ').tolist(),index=badGrinds['Plate ID']).stack()
		badGrinds				= badGrinds.reset_index().drop('level_1',1)
		badGrinds.columns		= ['Plate ID','Well']
		badGrinds['Grind']		= False
		badGrinds				= badGrinds.set_index(['Plate ID','Well'])
		self.table				= self.table.set_index(['Plate ID','Well'])
		self.table.update(badGrinds)
		self.table				= self.table.reset_index()

########################################################################		
	def setCrop(self,crop):
		'''Sets the item in the crop menu for all samples.'''
		self.table['Crop']		= crop
		
	def addLCFiles(self,fileDataLists):
		'''Adds lightcyler results for given fileDataLists.'''
		lc							= DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
		for fDL in fileDataLists:
			name,plate,exp,robot,pos= fDL
			lcFrame					= read_table(name,sep='\t',header=1)
			lcFrame					= lcFrame.drop(['Include','Color','Name','Status'],1)
			lcFrame['Experiment']	= exp
			lcFrame['Plate']		= plate
			lcFrame['Well']			= lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
			lcFrame['Group']		= lcFrame['Group'].astype(str)
			lcFrame.columns			= ['LC Well','Result','Experiment','Plate','Well',]
			lc						= lc.append(lcFrame)
		lc							= lc.set_index(['Plate','Well'])
		self.table					= self.table.set_index(['Plate ID','Well'])
		self.table.update(lc)
		self.table					= self.table.reset_index()
		
	def addTaqFiles(self,fileDataLists):
		'''Adds Taqman results for given fileDataLists.'''
		lc							= DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
		for fDL in fileDataLists:
			name,plate,exp,robot,pos= fDL
			lcFrame					= read_table(name,sep='\t',header=1)
			lcFrame					= lcFrame.drop(['Include','Color','465-510','618-660','Score','Status'],1)
			lcFrame['Experiment']	= exp
			lcFrame['Plate']		= plate
			lcFrame['Well']			= lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
			lcFrame['Call']			= lcFrame['Call'].astype(str)
			lcFrame					= lcFrame[['Pos','Call','Experiment','Plate','Well']]
			lcFrame.columns			= ['LC Well','Result','Experiment','Plate','Well',]
			lc						= lc.append(lcFrame)
		lc							= lc.set_index(['Plate','Well'])
		self.table					= self.table.set_index(['Plate ID','Well'])
		self.table.update(lc)
		self.table					= self.table.reset_index()

########################################################################
	def negativiseUnknowns(self):
		'''Sets all Unknown results to Negative. Used for Brassica/Ryegrass.'''
		self.table['Result']		= self.table['Result'].map(lambda res: 'Negative' if res == 'Unknown' else res)
		
	def setNonExistsToNegative(self):
		'''Reads the Exists column, and if it is False, sets the corresponding
		item in the Group column to Negative.'''
		self.table['Group']			= self.table.apply(lambda x: x['Group'] if x['Exists'] else 'Negative',1)
	
	def setNonExistsToNoSample(self):
		'''Reads the Exists column, and if it is False, sets the corresponding
		item in the Group column to No Sample.'''
		self.table['Group']			= self.table.apply(lambda x: x['Group'] if x['Exists'] else 'No sample',1)

	def includeAll(self):
		'''Sets the Include column to True for all samples.'''
		self.table['Include']		= True
		
	def excludeFailGrinds(self):
		'''Sets the Include column to False for failed grinds.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Grind']) else x['Include'],1)
		
	def excludeNegativeFailGrinds(self):
		'''Sets the Include column to False for failed grinds whose Group
		is Negative.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Grind']) and x['Group']=='Negative' else x['Include'],1)
		
	def excludeNonExists(self):
		'''Sets the Include column to False for samples that don't exist.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Exists']) else x['Include'],1)

	def missingResults(self):
		'''Determines if there are missing entries in Results or Groups'''
		return self.table['Result'].isnull().any(), self.table['Group'].isnull().any()
		
########################################################################
	def getPopulations(self):
		'''Returns a list of all populations in Population column.'''
		return unique(self.table.Population.ravel())
		
	def getGroups(self):
		'''Returns a list of all groups in Group column.'''
		return unique(self.table.Group.ravel())
		
	def getExperiments(self):
		'''Returns a list of all experiments in Experiment column.'''
		return unique(self.table.Experiment.ravel())
		
	def getCherriesByPop(self,cherryData):
		'''Gets cherrypicking data for passed populations/results.'''
		cherries					= DataFrame(columns=['Source plate','Dest plate',
														'Source Position','Source Well',
														'Destination Position','Destination Well',
														'Volume (ul)','Run'])
		for pop,groups,samples in cherryData:
			cherry					= self.table[['Plate ID','Well','Population','Group']]
			cherry					= cherry.loc[(cherry.Population==pop) & (cherry.Group.isin(groups))]
			cherry					= cherry.reset_index().head(samples)[['Plate ID','Well']]
			cherry.columns			= ['Source plate','Source Well']
			cherries				= cherries.append(cherry)
		cherries					= self.fillCherryData(cherries)
		return cherries
		
	def getCherriesNU(self):
		'''Gets cherrypicking data for Negatives and Unknowns.'''
		cherries					= DataFrame(columns=['Source plate','Dest plate',
														'Source Position','Source Well',
														'Destination Position','Destination Well',
														'Volume (ul)','Run'])
		cherry						= self.table[['Plate ID','Well','Group']]
		cherry						= cherry.loc[cherry.Group.isin(['Negative','Unknown'])]
		cherry						= cherry.reset_index()[['Plate ID','Well']]
		cherry.columns				= ['Source plate','Source Well']
		cherries					= cherries.append(cherry)
		cherries					= self.fillCherryData(cherries)
		return cherries
	
	def fillCherryData(self,cherries):
		'''Fills out the cherrypicking datatables, with Source Position,
		Destination Position, Destination Well, Run, and Volume.'''
		controls					= 2
		plateNames					= 'CP'
		sources						= [4,5,7,8]
		dests						= [10,11]
		wellsList					= [l+str(n) for n in range(1,13) for l in 'ABCDEFGH'][controls:]
		wells						= len(wellsList)
		rows						= len(cherries)
		cherries['Volume (ul)']		= 50
		cherries['Destination Well']= wellsList * (rows/wells) + wellsList[:rows%wells]
		cherries['Dest plate']		= [plateNames + str(i/wells+1) for i in range(rows)]
		sourcePlates				= unique(cherries['Source plate'].ravel())
		destPlates					= unique(cherries['Dest plate'].ravel())
		sourceDict 					= {i: j for i,j in itertools.izip(sourcePlates,itertools.cycle(sources))}
		destDict					= {i: j for i,j in itertools.izip(destPlates,itertools.cycle(dests))}
		cherries['Source Position']	= cherries['Source plate'].apply(lambda x: sourceDict[x],1)
		cherries['Destination Position']	= cherries['Dest plate'].apply(lambda x: destDict[x],1)
		self.run					= 1
		self.sources				= []
		self.dests					= []
		Run							= []
		for row in cherries.itertuples():
			source					= row[5]
			dest					= row[2]
			if not self.sources or source != self.sources[-1]:
				self.sources.append(source)
			if not self.dests or dest != self.dests[-1]:
				self.dests.append(dest)
			if len(self.sources) > 4 or len(self.dests) > 2:
				self.run = self.run + 1
				self.sources		= [source]
				self.dests			= [dest]
			Run.append('Run ' + format(self.run,'03d'))
		cherries['Run']				= Run
		cherries['Source Position']	= cherries['Source Position'].apply(lambda x: 'P'+str(x)) 
		cherries['Destination Position']	= cherries['Destination Position'].apply(lambda x: 'P'+str(x))
		cherries					= cherries[['Source plate','Dest plate','Source Position','Source Well',
												'Destination Position','Destination Well','Volume (ul)','Run']]
		return cherries
		
########################################################################
	def getKeaSexTestingData(self):
		'''Gets the data required by the Kea Sex testing process run.
		Some conversion is required.'''
		kea							= {'Male': 		'M',
										'Female':	'F',
										'1':		'1',
										'2':		'2',
										'3':		'3',
										'4':		'4',
										'Negative':	'U',
										'Unknown':	'U',
										'No Sample':'U',}
		data						= ['Plate ID','Plate Kea','Well','Experiment','Group']
		
		table						= self.table[data]
		table['Group']				= table['Group'].apply(lambda x: kea.get(x,'U'))
		table.rename(columns={'Plate Kea':			'Plate',
								'Experiment':		'Slipstream Expt No',
								'Plant Alt Names':	'Plant AltName',
								'Group':			'Sex Marker Results'}, inplace=True)
		table						= table.set_index(['Plate ID','Well'])
		return table
Example #46
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: année de naissance des PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True)  # useless given the previous line; to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    # Note: we keep those whose declar/naia pairs differ, plus the twins,
    # then drop the others (both F and G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]
    del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2: match indivifip with the eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    pac = indivi[indivi['persfip'].notnull() & (indivi['persfip'] == 'pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
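    # zip() builds tuple-valued keys, e.g. (hypothetical) naia 1990 and a declaration
    # starting '1042...' give key1 = (1990, '1042...'); truncating to 29 characters
    # presumably normalises the declaration strings on both sides before the
    # isin() lookups below.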
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]
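    # The two ~isin() filters form an anti-join: keep only the indivifip rows whose
    # key matches neither key1 nor key2 of any pac already found in the eec data.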


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'pac_ind1 length', len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'pac_ind2 length', len(pac_ind2)
    print "pac_ind1 & pac_ind2 created"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.empty and pac_ind2.empty:
        print "Warning: no link between pac and noindiv in either pac_ind1 or pac_ind2"
        pacInd = pac_ind1  # empty frame, kept so the code below still runs
    elif pac_ind1.empty:
        print "Warning: pac_ind1 is an empty data frame"
        pacInd = pac_ind2
    elif pac_ind2.empty:
        print "Warning: pac_ind2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'duplicates on (noindiv, type_pac):', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'duplicates on noindiv only:', pacInd.duplicated('noindiv').sum()
    print 'number of NaN:', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because they are used to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18, since they are not in their parents' menage according to the eec.

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
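    # merge() defaults to an inner join, so fip rows whose declaration has no
    # matching 'vous' declarer in indivi are silently dropped here.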
    print '    2.3 : fip1 created'

# # TODO: declar2 is not handled for now
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2, on='declaration')
    print '    2.4 : fip2 created'


    print fip1.duplicated().value_counts()
    print fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1  # TODO: why this line?
    print fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3, 4)  # TODO: not very clean according to Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)
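    # where(cond, a, b) picks elementwise, so actrec is 9 where agepf <= 15 and 5
    # otherwise, mirroring the lpr assignment a few lines above.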

## TODO: actrec problem for fip children between 16 and 20: we cannot tell whether they are students or employed
## TODO: problem with the birth months of FIP children: check whether those values can be recovered; Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")


    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
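    # Worked example (hypothetical): two fip children of the same ident both start
    # at noi == 99; the first pass flags the second row as a duplicate and lowers
    # its noi to 98, and the loop repeats until every (noi, ident) pair is unique.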

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
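    # E.g. (hypothetical) ident 12345 with noidec 1 gives idfoy 1234501, and with
    # noi 99 gives noindiv 1234599: ident fills the high digits, noi/noidec the last two.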
    fip['type_pac'] = 0
    fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip saved'