Example #1
def drop_reqpeat01():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})
    print(data)
    print(data.duplicated())
    print(data.drop_duplicates())
    data['v1'] = range(7)
    print(data.drop_duplicates(['k1']))
    print(data)
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))
Example #2
def test_frame_datetime64_duplicated():
    dates = date_range('2010-07-01', end='2010-08-05')

    tst = DataFrame({'symbol': 'AAA', 'date': dates})
    result = tst.duplicated(['date', 'symbol'])
    assert (~result).all()

    tst = DataFrame({'date': dates})
    result = tst.duplicated()
    assert (~result).all()
Example #3
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730
    df = DataFrame({'A': [0, 0, 1],
                    'B': [0, 0, 1],
                    'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)
Example #4
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print(len(db))

    print(db[0].keys())
    print(db[0]['nutrients'][0])

    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])

    print(pd.value_counts(info.group)[:10])

    print("Now let's process all of the nutrients")
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print("There seem to be a lot of duplicates")
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()

    print("Both info and nutrients have 'description' and 'group', so rename them")
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata.loc[30000])

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print(max_foods.loc['Amino Acids']['food'])
Example #5
def slide_10():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    print(data)
    print(data.duplicated())
    print(data.duplicated('k1'))
    print(data.drop_duplicates())

    data['v1'] = range(7)
    print(data)
    print(data.drop_duplicates(['k1']))
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))
Example #6
class Duplicated(object):

    def setup(self):
        n = (1 << 20)
        t = date_range('2015-01-01', freq='S', periods=(n // 64))
        xs = np.random.randn(n // 64).round(2)
        self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                             'b': np.random.choice(t, n),
                             'c': np.random.choice(xs, n)})
        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T

    def time_frame_duplicated(self):
        self.df.duplicated()

    def time_frame_duplicated_wide(self):
        self.df2.duplicated()
Example #7
    def submit(self, df: pd.DataFrame, job_opts: JobOpts, deplay=0.02, progressbar=True):
        """Sumit jobs to the cluster.

        You have to establish a connection first (explicit is better than implicit).

        Examples:
            >>> with js.connect():
            ...     js.submit([(0, 'echo "Hello world!"), (1, 'echo "Goodbye world!"')]
        """
        assert 'system_command' in df
        assert not df.duplicated().any()

        job_opts.working_dir.joinpath(job_opts.job_id).mkdir(parents=True, exist_ok=True)

        if self.host_opts.scheme in ['local']:
            worker = functools.partial(self._local_worker, job_opts=job_opts)
        else:
            worker = functools.partial(self._remote_worker, job_opts=job_opts)

        # Submit multiple jobs in parallel
        futures = []
        pool = concurrent.futures.ThreadPoolExecutor()
        for row in self._itertuples(df, progressbar=progressbar):
            future = pool.submit(worker, row)
            futures.append(future)
            time.sleep(deplay)
        pool.shutdown(wait=False)
        return futures
Example #8
def process_duplicated_entries(dfm_stk_strc: DataFrame, stockid):
    dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])]
    # print(dfm_duplicated)
    dfm_stk_strc.drop_duplicates('变动日期',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        # dfm_stk_strc.loc[index]['变动原因'] = dfm_stk_strc.loc[index]['变动原因'] +'|'+row['变动原因']
        dfm_stk_strc.loc[index,'变动原因'] = dfm_stk_strc.loc[index]['变动原因'] + '|' + row['变动原因']
        logprint('Stock %s 变动日期 %s record merged into the main record. %s' % (stockid, row['变动日期'], tuple(row)))
Example #9
def test_duplicated_on_empty_frame():
    # GH 25184

    df = DataFrame(columns=['a', 'b'])
    dupes = df.duplicated('a')

    result = df[dupes]
    expected = df.copy()
    tm.assert_frame_equal(result, expected)
Example #10
def deal_string02():
    import json
    db = json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print(len(db))
    print(db[0])
    print(db[0].keys())
    print(db[0]['nutrients'][0])
    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])
    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])
    print(pd.value_counts(info.group)[:10])

    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)
    print(nutrients)
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    print(info)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)
    print(nutrients)
    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata)
    print(ndata.loc[3000])
    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    # print(result)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print(max_foods.loc['Amino Acids']['food'])
Example #11
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool Series as a result and don't fail during
    # calculation. Actual values don't matter here, though usually it's all
    # False in this case
    assert isinstance(result, Series)
    assert result.dtype == np.bool_
Example #12
def jhu_world_normalize(df: pd.DataFrame) -> pd.DataFrame:
    # We are just gonna do per country data in this CSV file
    df = df.drop(columns=['Province/State'])
    # Change column names to the ones we use
    df = df.rename(columns={
        'Country/Region': 'country',
        'Lat': 'latitude',
        'Long': 'longitude'
    })

    # Remove duplicate countries
    dup_countries = {'Bahamas, The', 'Congo (Brazzaville)'}

    df = df[~df.country.isin(dup_countries)]

    # Change 'Korea, South' to 'South Korea'
    df.country = df.country.replace({
        'Korea, South': 'South Korea',
        'US': 'United States',
        'The Bahamas': 'Bahamas',
        'Congo (Kinshasa)': 'Democratic Republic of the Congo',
        'Czechia': 'Czech Republic',
        'Taiwan*': 'Taiwan',
        'Cruise Ship': 'Diamond Princess',
        'Cote d\'Ivoire': 'Ivory Coast'
    })

    # JHU data has a PK of (region, country) so we need to sum up the rows that are dates for each one
    dup = df[df.duplicated(['country'])]
    dup = dup.groupby('country').sum()
    dup = dup.reset_index()
    dup.latitude = 0
    dup.longitude = 0

    # Unique countries
    df = df[~df.duplicated(['country'], keep=False)]
    df = df.append(dup)

    return df
Example #13
def handle_data(data, labels):
    print("All data sets size: " + str(len(data)))
    test_data = []
    train_data = []
    for i in range(len(data)):
        if i % DIVISION_SCOPE == 0:
            test_data.append(data[i])
        else:
            train_data.append(data[i])
    print("Train Sets size: " + str(len(train_data)))
    print("Test Sets size: " + str(len(test_data)))
    test_data_df = DataFrame(test_data, columns=labels)
    train_data_df = DataFrame(train_data, columns=labels)
    # Clean Data Frame
    a_d = train_data_df.duplicated()
    b_d = test_data_df.duplicated()
    train_data_df = train_data_df.drop_duplicates()
    test_data_df = test_data_df.drop_duplicates()
    print("The Duplicated items in Train Sets size: " +
          str(len(a_d) - len(train_data_df)))
    print("The Duplicated items in Test Sets size: " +
          str(len(b_d) - len(test_data_df)))
    a_n = train_data_df.isnull()
    b_n = test_data_df.isnull()
    train_data_df = train_data_df.dropna()
    test_data_df = test_data_df.dropna()
    print("The Null items in Train Sets size: " +
          str(len(a_n) - len(train_data_df)))
    print("The Null items in Test Sets size: " +
          str(len(b_n) - len(test_data_df)))
    print("Clean Done. The Train Data Sets size: " + str(len(train_data_df)))
    print("Clean Done. The Test Data Sets size: " + str(len(test_data_df)))
    # train_data_df = train_data_df.drop(['used_app_before', 'contry_of_res', 'austim', 'jundice',
    #                                     'ethnicity', 'gender', 'age', 'A10_Score'], axis=1)
    # test_data_df = test_data_df.drop(['used_app_before', 'contry_of_res', 'austim', 'jundice',
    #                                     'ethnicity', 'gender', 'age', 'A10_Score'], axis=1)
    # train_data_df = train_data_df.drop(['age_desc'], axis=1)
    # test_data_df = test_data_df.drop(['age_desc'], axis=1)
    return test_data_df, train_data_df
Example #14
def describe_table(df: pd.DataFrame, variable_stats: pd.DataFrame) -> dict:
    """General statistics for the DataFrame.
    Args:
      df: The DataFrame to describe.
      variable_stats: Previously calculated statistic on the DataFrame.
    Returns:
        A dictionary that contains the table statistics.
    """
    n = len(df)

    memory_size = df.memory_usage(index=True).sum()
    record_size = float(memory_size) / n

    table_stats = {
        "n": n,
        "nvar": len(df.columns),
        "memsize": memory_size,
        "recordsize": record_size,
        "n_cells_missing": variable_stats.loc["n_missing"].sum(),
        "n_vars_with_missing": sum((variable_stats.loc["n_missing"] > 0).astype(int)),
        "n_vars_all_missing": sum((variable_stats.loc["n_missing"] == n).astype(int)),
    }

    table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / (
        table_stats["n"] * table_stats["nvar"])

    supported_columns = variable_stats.transpose()[variable_stats.transpose(
    ).type != Variable.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats["n_duplicates"] = (sum(df.duplicated(
        subset=supported_columns)) if len(supported_columns) > 0 else 0)
    table_stats["p_duplicates"] = ((table_stats["n_duplicates"] / len(df)) if
                                   (len(supported_columns) > 0
                                    and len(df) > 0) else 0)

    # Variable type counts
    table_stats.update({k.value: 0 for k in Variable})
    table_stats.update(
        dict(variable_stats.loc["type"].apply(
            lambda x: x.value).value_counts()))
    table_stats[Variable.S_TYPE_REJECTED.value] = (
        table_stats[Variable.S_TYPE_CONST.value] +
        table_stats[Variable.S_TYPE_CORR.value] +
        table_stats[Variable.S_TYPE_RECODED.value])
    return table_stats
Example #15
def proportionBySport(df: pd.DataFrame, yr: int, sport: str,
                      gdr: str) -> float:
    """
        The function answers questions like the following :
            “What was the percentage of female basketball players among all
            the female participants of the 2016 Olympics?”
    Returns:
        float: Percentage of participants who played the given sport among
               the participants of the given gender.
    """
    df = df[(df["Year"] == yr) & (df["Sex"] == gdr)]
    df = df[~df.duplicated(subset=["ID"])]
    df_res = df[df["Sport"] == sport]
    return (df_res.shape[0] / df.shape[0])
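A minimal usage sketch for the function above; the DataFrame and its values are invented for illustration only:

import pandas as pd

olympics = pd.DataFrame({
    'ID': [1, 1, 2, 3],
    'Year': [2016] * 4,
    'Sex': ['F'] * 4,
    'Sport': ['Basketball', 'Swimming', 'Basketball', 'Athletics'],
})
# Athlete 1 appears twice, but duplicates on ID are dropped first,
# so 2 of the 3 unique female participants count as basketball players.
print(proportionBySport(olympics, 2016, 'Basketball', 'F'))  # ~0.667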
Example #16
def validate(df:pd.DataFrame):
    '''Checks if the scraped data is valid and as expected.'''
    log("Running validity check")
    if df.empty:
        # End the program if no data was scraped
        log("Validity check failure: no data, program ended after " + str(pd.Timestamp.now() - start))
        raise Exception("Validity check failure: empty dataframe")
    else:
        if df.isnull().values.any():
            # Check for empty cells in the table
            log("Validity check: empty detected")
        if df.duplicated().any():
            # Check for duplicate lines in the table
            log("Validity check: duplicates detected")
Example #17
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool Series as a result and don't fail during
    # calculation. Actual values don't matter here, though usually it's all
    # False in this case
    assert isinstance(result, Series)
    assert result.dtype == np.bool_
Example #18
    def __add_duplicates_to_tree_and_remove_spurious_leaves(
        self,
        tree: nx.DiGraph,
        character_matrix: pd.DataFrame,
        node_name_generator: Generator[str, None, None],
    ) -> nx.DiGraph:
        """Append duplicates and prune spurious extant lineages from the tree.

        Places samples removed in removing duplicates in the tree as sisters
        to the corresponding cells that share the same mutations. If any extant
        nodes that are not in the original character matrix are present, they
        are removed and their lineages are pruned such that the remaining
        leaves match the set of samples in the character matrix.

        Args:
            tree: The tree after solving
            character_matrix: Character matrix

        Returns:
            The tree with duplicates added and spurious leaves pruned
        """

        character_matrix.index.name = "index"
        duplicate_groups = (character_matrix[character_matrix.duplicated(
            keep=False) == True].reset_index().groupby(
                character_matrix.columns.tolist())["index"].agg(
                    ["first", tuple]).set_index("first")["tuple"].to_dict())

        for i in duplicate_groups:
            new_internal_node = next(node_name_generator)
            nx.relabel_nodes(tree, {i: new_internal_node}, copy=False)
            for duplicate in duplicate_groups[i]:
                tree.add_edge(new_internal_node, duplicate)

        # remove extant lineages that don't correspond to leaves
        to_drop = []
        leaves = [n for n in tree if tree.out_degree(n) == 0]
        for l in leaves:
            if l not in character_matrix.index:
                to_drop.append(l)

                parent = [p for p in tree.predecessors(l)][0]
                while tree.out_degree(parent) < 2:
                    to_drop.append(parent)
                    parent = [p for p in tree.predecessors(parent)][0]

        tree.remove_nodes_from(to_drop)

        return tree
Example #19
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    We want to one-hot encode the categories ultimately having each as its own column and each row with a 0/1
    if that example belongs to the category.

    We needed to merge initially to ensure categories matched with messages, but now we form a new categories df
    from the merged df categories column, splitting the categories on ';', creating column names, and then retaining
    only the binary 0/1 as the row values. We finally drop the categories column in the original df and concat our
    new one-hot encoded categories df with it.


    :param df: the dataframe from load_data()
    :return: A new pandas dataframe with our categories one-hot encoded.
    """
    # We split the categories on the ; delimiter expanding into their own columns with expand=True
    categories = df.categories.str.split(';', expand=True)

    # Take the first row and remove the -N from each category to serve as column names
    row = categories.iloc[0, :]
    # use this row to extract a list of new column names for categories.
    category_column_names = row.apply(lambda x: x[:-2]).values
    categories.columns = category_column_names

    # For each column we want the row values to be just the binary numeric part 1/0
    for column in categories:
        # set each value to be the last character of the string
        categories[column] = categories[column].str[-1]
        
        # convert column from string to numeric
        categories[column] = categories[column].apply(int)

    # Now we drop the categories from the merged df and concat with the
    #  new categories frame (note axis=1 meaning concat on columns)
    df.drop(columns=['categories'], inplace=True)
    df = pd.concat([df, categories], axis=1)

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Check we now have no duplicates
    assert df[df.duplicated()].shape[0] == 0

    # Set 2 to 1 in the related column
    df.loc[df.related == 2, 'related'] = 1

    # Also child_alone always 0 so useless
    df.drop(columns=['child_alone'], inplace=True)

    return df
Example #20
    def _general_dict(self, scenario):
        """ Generate the meta-information that holds for all runs (scenario info etc)

        Parameters
        ----------
        scenario: smac.Scenario
            scenario file to get information from
        """
        # general stores information that holds for all runs, runspec holds information on a run-basis
        general = OrderedDict()

        if len(self.runscontainer.get_budgets()) > 1:
            general['# budgets'] = len(self.runscontainer.get_budgets())
        if len(self.runscontainer.get_folders()) > 1:
            general['# parallel runs'] = len(self.runscontainer.get_folders())

        # Scenario related
        general['# parameters'] = len(scenario.cs.get_hyperparameters())
        general['Deterministic target algorithm'] = scenario.deterministic
        general['Optimized run objective'] = scenario.run_obj
        if scenario.cutoff or scenario.run_obj == 'runtime':
            general['Cutoff'] = scenario.cutoff
        if any([str(lim)!='inf' for lim in [scenario.wallclock_limit, scenario.ta_run_limit, scenario.algo_runs_timelimit]]):
            general['Walltime budget'] = scenario.wallclock_limit
            general['Runcount budget'] = scenario.ta_run_limit
            general['CPU budget'] = scenario.algo_runs_timelimit
        # Instances
        num_train, num_test = [len([i for i in insts if i]) for insts in [scenario.train_insts, scenario.test_insts]]
        if num_train > 0 or num_test > 0:
            general['# instances (train/test)'] = "{} / {}".format(num_train, num_test)
        # Features
        num_feats = scenario.n_features if scenario.feature_dict else 0
        num_dup_feats = 0
        if scenario.feature_dict:
            dup_feats = DataFrame(scenario.feature_array)
            num_dup_feats = len(dup_feats[dup_feats.duplicated()])  # only contains train instances
        if num_feats > 0:
            general['# features (duplicates)'] = "{} ({})".format(num_feats, num_dup_feats)

        general['----------'] = '----------'

        combined_run = self.runscontainer.get_aggregated(False, False)[0]
        combined_stats = self._stats_for_run(combined_run.original_runhistory,
                                             combined_run.scenario,
                                             combined_run.incumbent)
        for k, v in combined_stats.items():
            general[k] = v

        return general
Example #21
def get_table_stats(df: pd.DataFrame, variable_stats: dict) -> dict:
    """General statistics for the DataFrame.

    Args:
      df: The DataFrame to describe.
      variable_stats: Previously calculated statistic on the DataFrame.

    Returns:
        A dictionary that contains the table statistics.
    """
    n = len(df)

    memory_size = df.memory_usage(deep=config["memory_deep"].get(bool)).sum()
    record_size = float(memory_size) / n

    table_stats = {
        "n": n,
        "n_var": len(df.columns),
        "memory_size": memory_size,
        "record_size": record_size,
        "n_cells_missing": 0,
        "n_vars_with_missing": 0,
        "n_vars_all_missing": 0,
    }

    for series_summary in variable_stats.values():
        if "n_missing" in series_summary and series_summary["n_missing"] > 0:
            table_stats["n_vars_with_missing"] += 1
            table_stats["n_cells_missing"] += series_summary["n_missing"]
            if series_summary["n_missing"] == n:
                table_stats["n_vars_all_missing"] += 1

    table_stats["p_cells_missing"] = table_stats["n_cells_missing"] / (
        table_stats["n"] * table_stats["n_var"])

    supported_columns = [
        k for k, v in variable_stats.items() if v["type"] != Unsupported
    ]
    table_stats["n_duplicates"] = (sum(df.duplicated(
        subset=supported_columns)) if len(supported_columns) > 0 else 0)
    table_stats["p_duplicates"] = ((table_stats["n_duplicates"] / len(df)) if
                                   (len(supported_columns) > 0
                                    and len(df) > 0) else 0)

    # Variable type counts
    table_stats.update(
        {"types": dict(Counter([v["type"] for v in variable_stats.values()]))})

    return table_stats
Example #22
    def summarize_dataframe(self, source: pd.DataFrame, name: str,
                            target_dict: dict, skip: List[str]):
        target_dict["name"] = name
        target_dict["num_rows"] = len(source)
        target_dict["num_columns"] = len(source.columns)
        target_dict["num_skipped_columns"] = len(source.columns) - len(
            [x for x in source.columns if x not in skip])

        target_dict["memory_total"] = source.memory_usage(index=True,
                                                          deep=True).sum()
        target_dict["memory_single_row"] = \
            float(target_dict["memory_total"]) / target_dict["num_rows"]

        target_dict["duplicates"] = NumWithPercent(sum(source.duplicated()),
                                                   len(source))
Example #23
def delete_duplicates(from_date=None,
                      to_date=None,
                      expenses: pd.DataFrame = None):
    if (not from_date or not to_date) and expenses.empty:
        expenses = get_expenses(from_date, to_date)
    expenses = expenses[expenses.duplicated(subset=['amount', 'date'],
                                            keep=False)]
    merged = expenses.merge(right=expenses, on=['amount', 'date'])
    merged: pd.DataFrame = merged[merged['id_x'] != merged['id_y']]
    merged = merged.iloc[::2, :]
    merged['score'] = merged.apply(similar, axis=1)
    duplicates = merged[merged['score'] > 0.6]
    logger.info(
        f'Duplicate entries: {"none" if duplicates.empty else duplicates}')
    duplicates.apply(delete_expense, axis=1)
Example #24
def test_duplicated_subset(subset, keep):
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        subset = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        subset = [subset]

    expected = df[subset].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
Example #25
    def calculate(df: pd.DataFrame):
        is_duplicated = df.duplicated(subset=['ip', 'device', 'os', 'channel', 'app', 'click_time'], keep=False)
        features = np.zeros(len(df))
        features[~is_duplicated] = np.nan

        curr_start_index = None
        prev_columns = None
        dup_df = df[is_duplicated]
        dup_rows = zip(dup_df.ip, dup_df.device, dup_df.os, dup_df.channel, dup_df.app, dup_df.click_time)
        for index, curr_columns in zip(dup_df.index, zip(dup_rows)):
            if prev_columns != curr_columns:
                curr_start_index = index
            features[index] = index - curr_start_index
            prev_columns = curr_columns
        df['DuplicateRowIndexDiff'] = features
        return df[['DuplicateRowIndexDiff']]
Example #26
    def __remove_duplicates__(df: pd.DataFrame, subset: Union[List[str], str] = None) -> pd.DataFrame:
        """Removes duplicated rows from `df`

        :param df: A pandas DataFrame
        :param subset: Column name/s to identify duplicates. If it is `None` all columns will be used
        :return: The DataFrame with no duplicated rows
        """
        if subset is None:
            subset = df.columns
        else:
            Transform.__guard_against_non_existent_columns__(df, subset)

        if df.duplicated(subset).sum() > 0:
            df.drop_duplicates(subset, inplace=True, keep='last')

        return df
Example #27
def dup_rows(data: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Get duplicate rows.

    Parameters
    ----------
    data : DataFrame
        Data for getting duplicate rows.
    **kwargs : dict, optional
        Extra arguments to `DataFrame.duplicated`. Refer to Pandas
        documentation for all possible arguments.
    Returns
    -------
    DataFrame
        Table of duplicate rows.
    """
    return data.loc[data.duplicated(**kwargs)].copy()
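For illustration, a tiny made-up frame showing how keyword arguments such as subset and keep are passed straight through to DataFrame.duplicated:

import pandas as pd

frame = pd.DataFrame({'k1': ['a', 'a', 'b'], 'k2': [1, 1, 2]})
print(dup_rows(frame))                             # row 1, the repeat of row 0
print(dup_rows(frame, subset=['k1'], keep=False))  # rows 0 and 1, both 'a' rows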
Example #28
    def _general_dict(self, scenario, bohb_parallel=False):
        """ Generate the meta-information that holds for all runs (scenario info etc)

        Parameters
        ----------
        scenario: smac.Scenario
            scenario file to get information from
        bohb_parallel: Union[False, int]
            if set, defines number of parallel runs
        """
        # general stores information that holds for all runs, runspec holds information on a run-basis
        general = OrderedDict()

        # TODO with multiple BOHB-run-integration
        #    overview['Run with best incumbent'] = os.path.basename(best_run.folder)
        #if num_conf_runs != 1:
        #    overview['Number of configurator runs'] = num_conf_runs

        self.logger.debug("bohb_parallel in overview: %s", bohb_parallel)
        if bohb_parallel:
            general['# aggregated parallel BOHB runs'] = bohb_parallel

        # Scenario related
        general['# parameters'] = len(scenario.cs.get_hyperparameters())
        general['Deterministic target algorithm'] = scenario.deterministic
        general['Optimized run objective'] = scenario.run_obj
        if scenario.cutoff or scenario.run_obj == 'runtime':
            general['Cutoff'] = scenario.cutoff
        if any([str(lim)!='inf' for lim in [scenario.wallclock_limit, scenario.ta_run_limit, scenario.algo_runs_timelimit]]):
            general['Walltime budget'] = scenario.wallclock_limit
            general['Runcount budget'] = scenario.ta_run_limit
            general['CPU budget'] = scenario.algo_runs_timelimit
        # Instances
        num_train, num_test = [len([i for i in insts if i]) for insts in [scenario.train_insts, scenario.test_insts]]
        if num_train > 0 or num_test > 0:
            general['# instances (train/test)'] = "{} / {}".format(num_train, num_test)
        # Features
        num_feats = scenario.n_features if scenario.feature_dict else 0
        num_dup_feats = 0
        if scenario.feature_dict:
            dup_feats = DataFrame(scenario.feature_array)
            num_dup_feats = len(dup_feats[dup_feats.duplicated()])  # only contains train instances
        if num_feats > 0:
            general['# features (duplicates)'] = "{} ({})".format(num_feats, num_dup_feats)

        return general
Example #29
def extract_duplicates(data: DataFrame,
                       duplicate_columns: List[str],
                       index_columns: List[str],
                       fill_na='NA') -> DataFrame:
    # without filling nulls we would get false negatives as nan != nan in Python
    if fill_na:
        data = data.fillna(fill_na)
    # pre-filter to operate only on duplicates
    data = data[duplicate_columns + index_columns]
    data = data[data.duplicated(keep=False, subset=duplicate_columns)]
    by_values = (data.reset_index().groupby(duplicate_columns)
                 [index_columns].apply(to_nested_series))
    if by_values.empty:
        return DataFrame()
    some_index_column = by_values[index_columns[0]]
    df = by_values[some_index_column.apply(len) > 1].reset_index(drop=True)
    return explode_rows_with_lists(df)
Example #30
    def rows(self):
        from pandas import read_pickle, DataFrame, merge, concat

        connection = self.output().connect()
        cursor = connection.cursor()

        sql = f"""
               SELECT {', '.join(['id'] + list(self.columns))}
               FROM {self.table};
               """

        cursor.execute(sql)
        results = cursor.fetchall()

        current_df = DataFrame(results, columns=['id'] + list(self.columns))

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if not df.empty:

            # get list of dim values that are already in the database, but have
            # changed their attributes
            merged = merge(current_df, df, on=self.columns, how='inner')
            current_df = concat([current_df, merged], axis=0)
            is_duplicate = current_df.duplicated(keep=False)

            # duplicates = current_df[is_duplicate]
            new = current_df[~is_duplicate]

            to_delete = new['id'].tolist()
            to_copy = df[df[list(self.id_cols)].isin(new[list(
                self.id_cols)].to_dict(orient='list')).all(axis=1)]

            to_copy = to_copy[list(self.columns)]

        delete_sql = f"""
                      DELETE FROM {self.table}
                      WHERE id IN ({', '.join(to_delete)});
                      """

        cursor.execute(delete_sql)

        for index, line in to_copy.iterrows():  # returns (index, Series) tuple
            yield line.values.tolist()
Example #31
def process_duplicated_entries(dfm_stk_info: DataFrame, stockid):
    dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])]
    # print(dfm_duplicated)
    dfm_stk_info.drop_duplicates('股权登记日',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        dfm_stk_info.loc[index,'分红年度'] = add_considering_None(dfm_stk_info.loc[index]['分红年度'],row['分红年度'])
        dfm_stk_info.loc[index,'分红方案'] = dfm_stk_info.loc[index]['分红方案'] + '|' + row['分红方案']
        if dfm_stk_info.loc[index]['方案文本解析错误标识位'] !='E':
            if row['方案文本解析错误标识位'] == 'E':
                dfm_stk_info.loc[index, '方案文本解析错误标识位'] = 'E'
                dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = None
                dfm_stk_info.loc[index, '转增(股)/10股'] = None
                dfm_stk_info.loc[index, '送股(股)/10股'] = None
            else:
                dfm_stk_info.loc[index,'派息(税前)(元)/10股'] = add_considering_None(dfm_stk_info.loc[index]['派息(税前)(元)/10股'],row['派息(税前)(元)/10股'])
                dfm_stk_info.loc[index,'转增(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['转增(股)/10股'] , row['转增(股)/10股'])
                dfm_stk_info.loc[index,'送股(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['送股(股)/10股'] , row['送股(股)/10股'])
        logprint('Stock %s 股权登记日 %s record merged into the main record. %s' % (stockid, row['股权登记日'], tuple(row)))
Example #32
def get_duplicates(df: pd.DataFrame,
                   supported_columns) -> Optional[pd.DataFrame]:
    """Obtain the most occurring duplicate rows in the DataFrame.

    Args:
        df: the Pandas DataFrame.
        supported_columns: the columns to consider

    Returns:
        A subset of the DataFrame, ordered by occurrence.
    """
    n_head = config["duplicates"]["head"].get(int)

    if n_head > 0 and supported_columns:
        return (df[df.duplicated(
            subset=supported_columns,
            keep=False)].groupby(supported_columns).size().reset_index(
                name="count").nlargest(n_head, "count"))
    return None
Example #33
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
    """ Provides information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and a list of the index labels of the dropped rows
    """

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index")

    return data, dupl_rows
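A quick sketch of what the helper returns, using toy data rather than anything from the original project:

import pandas as pd

raw = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
deduped, dropped_index = _drop_duplicates(raw)
print(dropped_index)   # [1] -- index label of the duplicated row
print(len(deduped))    # 2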
Example #34
def get_duplicates(
        df: pd.DataFrame,
        supported_columns) -> Tuple[Dict[str, Any], Optional[pd.DataFrame]]:
    """Obtain the most occurring duplicate rows in the DataFrame.

    Args:
        df: the Pandas DataFrame.
        supported_columns: the columns to consider

    Returns:
        A subset of the DataFrame, ordered by occurrence.
    """
    n_head = config["duplicates"]["head"].get(int)

    metrics: Dict[str, Any] = {}
    if n_head > 0:
        if supported_columns and len(df) > 0:
            duplicates_key = config["duplicates"]["key"].get(str)
            if duplicates_key in df.columns:
                raise ValueError(
                    f"Duplicates key ({duplicates_key}) may not be part of the DataFrame. Either change the "
                    f" column name in the DataFrame or change the 'duplicates.key' parameter."
                )

            duplicated_rows = df.duplicated(subset=supported_columns,
                                            keep=False)
            duplicated_rows = (df[duplicated_rows].groupby(
                supported_columns).size().reset_index(name=duplicates_key))

            metrics["n_duplicates"] = len(duplicated_rows[duplicates_key])
            metrics["p_duplicates"] = metrics["n_duplicates"] / len(df)

            return (
                metrics,
                duplicated_rows.nlargest(n_head, duplicates_key),
            )
        else:
            metrics["n_duplicates"] = 0
            metrics["p_duplicates"] = 0.0
            return metrics, None
    else:
        return metrics, None
Example #35
    def load_recordings(self, databases):
        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

        recordings = DataFrame()
        for database in databases:
            path = op.join(data_dir, 'KEYS', '{db}.key'.format(db=database))
            local_keys = read_table(path, delim_whitespace=True, names=FIELDS)
            recordings = recordings.append(local_keys)

        # remove duplicates
        recordings = recordings[~recordings.duplicated()]

        # index using unique recording name
        recordings = recordings.set_index('recording')

        # translate channels (a --> 1, b --> 2, x --> 1)
        func = lambda channel: {'a': 1, 'b': 2, 'x': 1}[channel]
        recordings['channel'] = recordings['channel'].apply(func)

        return recordings
Example #36
    def project_state_assets(self, df: pd.DataFrame,
                             table: SQLTable) -> pd.DataFrame:
        self._log.debug(f"Cleaning {whoami()}")

        if self.column_check(df.columns.tolist(), table):
            df = df.replace("-", 0)
            float_cols = get_column_types(table, Float)
            date_cols = get_column_types(table, Date)
            df = df.pipe(ccast, (float_cols, tf_net_acres)).pipe(
                ccast, (date_cols, tf_date))
            pk = table.primary_key.columns.keys()
            duplicates = df.duplicated(pk)
            dup_idx = ", ".join(
                df[duplicates].index.astype("unicode").tolist())
            if np.any(duplicates):
                message_box(
                    f"Duplicate assets found at {dup_idx}. {', '.join(pk)} combination should be unique."
                )

            return df
Example #37
def split_jhu_state_data(
        df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:

    state = df[df.duplicated(['state'])]
    state = state.groupby('state').sum()
    state = state.reset_index()
    state.latitude = 0.0
    state.longitude = 0.0

    geocoder = Nominatim(timeout=60)

    lat, lon = zip(*[
        pandemics.utils.geocode(geocoder, f'{state}, United States')
        for state in state.state
    ])

    state.latitude = lat
    state.longitude = lon

    state = state.drop(columns=['fips'])

    return df, state
Example #38
def find_by(df: pd.DataFrame, columns: List[str]) -> Result:
    """Compare items rows in `df` by `columns`

    Returns:
        Any duplicates
    """
    result = Result(f"Duplicates")
    result.items_count = len(df)
    df = df.dropna(subset=columns, how="all")
    duplicates = df[df.duplicated(columns, keep=False)][columns]
    if duplicates.empty:
        return result

    errors = {}
    for _, d in duplicates.groupby(columns):
        msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in columns]
        errors[f"same {', '.join(msgs)}"] = list(d.index)

    result.add_error(
        f"{len(duplicates)} duplicate(s) with same {', '.join(columns)}",
        errors=errors)
    return result
Example #39
def fix_raw_results(raw_df: pd.DataFrame):
    """
    Fixes some issues with the raw results
    """

    # There's some redundancy with the raw results so we'll remove duplicated
    # results
    raw_df = raw_df[~raw_df.duplicated()]

    # There might also be some differences between run numbers between
    # experiments
    uniq_ids = raw_df.id.unique()
    n = len(uniq_ids)
    id_dfs = [pd.DataFrame()] * n
    for (i, id_val) in enumerate(uniq_ids):
        tmp_df = raw_df[raw_df['id'] == id_val].groupby('metric',
                                                        as_index=False).mean()
        tmp_df['id'] = id_val
        id_dfs[i] = tmp_df

    df = pd.concat(id_dfs, ignore_index=True)
    df = df.reset_index(drop=True)
    return df
Example #40
def clean_dict(df: DataFrame, column):
    print("Preprocessing data text - cleaning")
    # drop duplicates
    d = df.duplicated(column, keep='first')
    print("removing duplicate sentences. Duplicates = {} sentences".format(
        len(df[d][column])))
    #lowercase
    df[column] = df[column].str.lower()
    print("len before removing: {}".format(len(df[column])))
    df.drop_duplicates(subset=column, inplace=True, keep='first')
    print("len after removing: {}".format(len(df[column])))
    stop = stopwords.words('english')
    for i, row in df.iterrows():
        sentence = row[column]
        #remove http url
        sentence = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                          '', sentence)
        #handle emojis before html
        sentence = convert_emojis(sentence)
        sentence = convert_emoticons(sentence)
        #remove html
        soup = BeautifulSoup(sentence, "html.parser")
        sentence = soup.get_text()

        # remove common stop words - might have to use it earlier because some stop words use symbols, e.g. isn't
        sentence = " ".join(x for x in sentence.split() if x not in stop)
        #remove punctuation
        sentence = sentence.translate(str.maketrans('', '',
                                                    string.punctuation))

        #stemming?lemmatize?
        sentence = lemmatize_words(sentence)
        #spelling correction? textblob or pyspellchecker - problem: slang and not english words will be transformed in other words.

        df.at[i, column] = sentence
    print(df[column].head())
    return df
Example #41
def clean_trades_df(df: pd.DataFrame) -> pd.DataFrame:
    # get original number of ticks
    og_tick_count = df.shape[0]
    # drop irregular trade conditions
    df = df.loc[df.irregular == False]
    # drop trades with >1 sec timestamp diff
    dt_diff = (df.sip_dt - df.exchange_dt)
    df = df.loc[dt_diff < pd.to_timedelta(1, unit='S')]
    # add median filter and remove outlier trades
    df = median_outlier_filter(df)
    # remove duplicate trades
    num_dups = sum(df.duplicated(subset=['sip_dt', 'exchange_dt', 'sequence', 'trade_id', 'price', 'size']))
    if num_dups > 0:
        print(num_dups, 'duplicated trades removed')
        df = df.drop_duplicates(subset=['sip_dt', 'exchange_dt', 'sequence', 'trade_id', 'price', 'size'])
    # drop trades with zero size/volume
    df = df.loc[df['size'] > 0]
    dropped_rows = og_tick_count - df.shape[0]
    print('dropped', dropped_rows, 'ticks (', round((dropped_rows / og_tick_count) * 100, 2), '%)')
    # sort df
    df = df.sort_values(['sip_dt', 'exchange_dt', 'sequence'])
    # small cols subset
    df = df[['sip_dt', 'price', 'size']]
    return df.rename(columns={'sip_dt': 'date_time', 'size': 'volume'}).reset_index(drop=True)    
Example #42
def check_primary_key(
    df: pd.DataFrame,
    primaryKey: Union[str, List[str]],
    skip_required: bool = False,
    skip_single: bool = False,
) -> List[Union[ConstraintError, PrimaryKeyError]]:
    """
    Check table primary key.

    Arguments:
        df: Table.
        primaryKey: Primary key field names.
        skip_required: Whether to not check for missing values in primary key fields.
        skip_single: Whether to not check for duplicates if primary key is one field.

    Returns:
        A list of errors.
    """
    errors = []
    key = _as_list(primaryKey)
    if key:
        if not skip_required:
            for name in key:
                errors += check_field_constraints(df[name],
                                                  required=True,
                                                  field=dict(name=name))
        if skip_single and len(key) < 2:
            return errors
        invalid = df.duplicated(subset=key)
        if invalid.any():
            errors.append(
                PrimaryKeyError(
                    primaryKey=key,
                    values=df[key][invalid].drop_duplicates().values.tolist(),
                ))
    return errors
Example #43
pivoted = ldata.pivot('date', 'item')
pivoted[:5]

pivoted['value'][:5]

unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]


### Remove duplicate data
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

data.duplicated()

data.drop_duplicates()

data['v1'] = range(7)
data.drop_duplicates(['k1'])

data.drop_duplicates(['k1', 'k2'], keep='last')


### Transform data using a function or mapping
#1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
Example #44
def test_drop_duplicates():
    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
Example #45
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 17 09:44:35 2017

@author: HanKin
"""

from pandas import Series, DataFrame  
  
data = DataFrame({'k': [1, 1, 2, 2],'y':[2,2,4,1]})  
  
print(data)  
  
IsDuplicated = data.duplicated()  
  
print(IsDuplicated)  
print(type(IsDuplicated)) 
  
data = data.drop_duplicates()  
print(data)  
Example #46
f3 = pd.read_csv('rcs/macrodata.csv')
periods = pd.PeriodIndex(year=f3.year, quarter=f3.quarter, name='date')
f3 = DataFrame(f3.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
ldata = f3.stack().reset_index().rename(columns={0: 'value'})
wdata = ldata.pivot('date', 'item', 'value')
# print ldata
# print wdata

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
# Remove duplicate values
# data.duplicated() returns a boolean Series indicating whether each row is a duplicate
s1 = data.duplicated()
f4 = data.loc[np.logical_not(s1), :]
# print(f4)
# drop_duplicates directly returns a DataFrame with the duplicate rows removed
f5 = data.drop_duplicates()
# print(f5)
# Filter on a given column, keeping the last occurrence
f6 = data.drop_duplicates(['k1'], keep='last')
# print(f6)


# Add a column to the DataFrame so the values line up; map can replace the data of the original Series
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
Example #47
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# DUPLICATED VALUES -------------------------

## create new dataframe
zip3 = zip(['red', 'green', 'blue', 'orange']*3, [5, 10, 20, 40]*3, 
                [':(', ':D', ':D']*4)
                
df3 = DataFrame(zip3, columns = ['A', 'B', 'C'])

## pandas method `duplicated`
df3.duplicated() # searching from top to bottom by default
df3.duplicated(keep='last') # searches bottom to top

## subset duplicated values
df3.duplicated(subset = ['A', 'B'])
df3.duplicated(['A', 'B'])

## HOW to get all values that have duplicates (purging)
t_b = df3.duplicated()
b_t = df3.duplicated(keep='last')
unique = ~(t_b | b_t) # complement where either is true
unique = ~t_b & ~b_t
unique

df3[unique]

# DROPPING DUPLICATES --------------------------------------------
df3.drop_duplicates()
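As a follow-up to the purging idiom above, the same "keep only fully unique rows" mask can be produced with a single call by passing keep=False, which marks every member of a duplicate group (a minor alternative, not part of the original snippet):

unique = ~df3.duplicated(keep=False)
df3[unique]

# drop_duplicates(keep=False) likewise drops every member of a duplicate group outright
df3.drop_duplicates(keep=False)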
Example #48
from pandas.io.parsers import TextParser
from numpy import NaN as NA
from lxml.html import parse
from urllib.request import urlopen
from lxml import objectify
from io import StringIO

###############################################################

data = DataFrame({'k1':['one'] * 3 + ['two'] * 4,
                  'k2':[1,1,2,3,3,4,4]})

print(data)
print('\n')

print(data.duplicated())
print('\n')

print(data.drop_duplicates())
print('\n')

data['v1'] = range(7)
print(data.drop_duplicates(['k1']))
print('\n')

print(data.drop_duplicates(['k1','k2'], keep='last'))
print('\n')

data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                  'ounces':[4,3,12,6,7.5,8,3,5,6]})
print(data)
Example #49
# -*- coding: utf-8 -*-
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                       Handling duplicate data
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#%%
import pandas as pd
from pandas import Series, DataFrame
d1 = DataFrame({'a': ['a', 'b'] * 6, 'b': [1, 2, 3, 4, 5, 6] * 2, 'c': [1, 3, 5] * 4})

#%% List the duplicated rows
d1.duplicated()
#%% Select the non-duplicated rows
d1[d1.duplicated() == False]
#%% Check duplicates on a single column
d1.duplicated('a')
#%% Check duplicates on two or more columns
d1.duplicated(['a', 'c'])
#%% Keep the last occurrence
d1.duplicated('a', keep='last')

#%% Drop duplicates
# drop_duplicates() is equivalent to d1[d1.duplicated() == False]
d1.drop_duplicates()

Example #50
def create_fip(temporary_store = None, year = None):
    assert temporary_store is not None
    assert year is not None
    # fip : fichier d'imposition des personnes
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #       puis on retire les autres (à la fois F et G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: there is no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
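    # Boolean arithmetic as a vectorized if/else: lpr is 3 when agepf <= 20 and 4 otherwise,
    # equivalent to np.where(fip['agepf'] <= 20, 3, 4).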
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec issue for fip children aged 16 to 20: we cannot tell whether they are students or employed
    # TODO: issue with the birth months of FIP children: check whether these values can be recovered. Alexis: clearly not

    # Reassigning noi for fip children if they are more than one per foyer fiscal
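    # The loop below decrements noi on rows whose (noi, ident) pair is duplicated and repeats
    # until every pair is unique, so each extra fip child of a foyer gets its own noi.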
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Пример #51
0
def test_duplicated_nan_none(keep, expected):
    df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
Пример #52
0
b[-1] = np.nan
# numpy equivalent of an if/else: where a is null, take the value from b
np.where(pd.isnull(a),b,a)
# similar pandas method: null values in b are filled from a
b[:-2].combine_first(a[2:])
# usage with a DataFrame
df1 = DataFrame({'a':[1.,np.nan,5.,np.nan],
	'b':[np.nan,2.,np.nan, 6.],
	'c':range(2,18,4)})
df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.],
	'b':[np.nan,3.,4,6.,8.]})
df1.combine_first(df2)
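# combine_first aligns on both index and columns: null values in df1 are filled from df2,
# and the result's index/columns are the union of the two frames.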
## Removing duplicate rows
data = DataFrame({'k1':['one']*3+['two']*4,
	'k2':[1,1,2,3,3,4,4]})
data.duplicated()
# drop duplicates, keeping the first occurrence by default
data.drop_duplicates()
# drop duplicates based on a single column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# keep the last occurrence instead
data.drop_duplicates(['k1','k2'], take_last=True)

## Transforming data with a function or mapping
data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami',
	'corned beef','Bacon','pastrami','honey ham','nova lox'],
	'ounces':[4,3,12,6,7.5,8,3,5,6]})
meat_to_animal = {
	'bacon':'pig',
	'pulled pork':'pig',
Пример #53
0
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
# dfMerged = pd.merge(df1, df2, on='key')
# print dfMerged
# dfMergedOuter = pd.merge(df1, df2, how='outer')
# print dfMergedOuter

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
# dfMerged = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# print dfMerged

left = DataFrame({'key1':['foo', 'foo', 'bar'], 'key2':['one', 'foo', 'one'], 'lval':[1, 2, 3]})

right = DataFrame({'key1':['foo', 'foo', 'bar', 'bar'], 'key2':['one', 'foo', 'one', 'one'], 'rval':[4, 5, 6, 7]})

dfMergedOuter = pd.merge(left, right, how='outer')
# print dfMergedOuter

arr = np.arange(12).reshape((6,2))
# print arr
# print np.arange(12)
arrConcat = np.concatenate([arr, arr], axis = 1)
# print arrConcat

data = DataFrame({'k1': ['one']*3 + ['two']*4, 'k2': [1,1,2,3,3,4,4]})
# print data
dataDuplicate = data.duplicated()
print dataDuplicate
dropDuplicate = data.drop_duplicates()
print dropDuplicate
Пример #54
0
def test_duplicated_keep(keep, expected):
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
Пример #55
0
match = regex_email.findall(email)


df = DataFrame(match)

# ---------------------------------------- pandas course notes

# df: table of students with their email addresses
# DataFrame.rename to rename the columns of a DataFrame
df = df.rename(columns = {'old_name': 'new_name'})

# a lambda can also be used to relabel the row index
df.index = df.index.map(lambda x: 'Eleve ' + str(x))

# duplicates can be removed
df.duplicated()  # check whether there are duplicated rows
df.drop_duplicates()  # drop them

# -------------------------- processing the aliments.csv file
aliments = pd.read_csv('aliments.csv')

# build an indicator matrix of foods x traces they contain
aliments['traces'].isnull()
aliments_with_traces = aliments.dropna(subset = ['traces'])

traces_iter = (set(x.split(',')) for x in aliments_with_traces['traces'])
traces = set.union(*traces_iter)

dummies = DataFrame(np.zeros((len(aliments_with_traces), len(traces))), columns = traces)
for i, tr in enumerate(aliments_with_traces.traces):
	dummies.ix[i, tr.split(',')] = 1
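
# Runnable sketch of the same indicator-matrix pattern with made-up data (no aliments.csv needed);
# the column names are assumptions carried over from the snippet above.
import numpy as np
from pandas import DataFrame

aliments = DataFrame({'product': ['p1', 'p2', 'p3'],
                      'traces': ['milk,nuts', 'nuts', 'soy,milk']})
traces_iter = (set(x.split(',')) for x in aliments['traces'].dropna())
traces = sorted(set.union(*traces_iter))  # note the * : union of all the per-row sets
dummies = DataFrame(np.zeros((len(aliments), len(traces))), columns = traces)
for i, tr in enumerate(aliments['traces']):
    dummies.loc[i, tr.split(',')] = 1  # flag every trace listed on row i
print(dummies)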
Пример #56
0
info = DataFrame(db,  columns=info_keys)
print info[:5]
print info

print pd.value_counts(info.group)[:10]

nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)
print nutrients
print nutrients.duplicated().sum()

nutrients = nutrients.drop_duplicates()

col_mapping = {'description': 'food',
               'group': 'fgroup'}

info = info.rename(columns=col_mapping, copy=False)
print info

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print nutrients

ndata = pd.merge(nutrients, info, on='id', how='outer')
Пример #57
0
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: birth year of the PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # Handle multiple-declaration cases so as not to create duplicate pac
#
#
# # Collect every pac of the foyers
# L <- max(nchar(foyer$anaisenf))/5 # maximum number of pac
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # useless given the previous line, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    # Note: we keep the rows whose (declaration, naia) pairs differ, as well as twins,
    # and drop the others (children coded as both F and G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    pac = indivi[(indivi['persfip'].notnull()) & (indivi['persfip']=='pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'longueur pacInd1' , len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: declar2 is not handled for now
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: Why this line?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG: there is no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO: not very clean, according to Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: actrec issue for fip children aged 16 to 20: we cannot tell whether they are students or employed
## TODO: issue with the birth months of FIP children: check whether these values can be recovered. Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
Пример #58
0
nutrients = DataFrame(db[0]['nutrients'])
print(nutrients[:7])

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
print(pd.value_counts(info.group)[:10])

nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients[:10])
print(nutrients.duplicated().sum())

nutrients = nutrients.drop_duplicates()

col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info[:10])

col_mapping = {
    'description': 'nutrient',
    'group': 'nutgroup'
}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients[:10])
Пример #59
0
from pandas import Series, DataFrame
import pandas as pd
import numpy as np


# DUPLICATED VALUES -----------------------------------

## create a new data frame
zip3 = zip(['red', 'green', 'blue', 'orange']*4, [5, 10, 20, 40]*3, [':(', ':D', ':D']*4)
df3 = DataFrame(zip3, columns = ['A', 'B', 'C'])
df3

## returns boolean vector of duplicated rows of a whole DataFrame or subset using method `duplicated`
## IMPORTANT: python, by default, searches for duplicated values from top-to-bottom
## and will not mark a row as "duplicated" until it actually finds another instance
df3.duplicated() # defaults using all rows searching top-to-bottom
df3.duplicated(take_last = True) # option `take_last = True` searches bottom-to-top
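# in pandas >= 0.17 take_last is replaced by keep: df3.duplicated(keep = 'last') does the same bottom-to-top search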

## SUBSET duplicates
# if we want the duplicated criteria to be of a subset, we can do that too
df3.duplicated(subset = ['A', 'B'])
df3.duplicated(['A', 'B']) # same as before


## HOW to find the rows that have no duplicate at all (negate below to get every row that HAS a duplicate)
t_b = df3.duplicated()
b_t = df3.duplicated(take_last = True)
unique = ~(t_b | b_t) # negate where either is true
unique
unique = ~t_b & ~b_t # same as above
unique
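
## and the rows that DO have a duplicate somewhere: keep rows flagged in either direction
has_dup = t_b | b_t
df3[has_dup]
# equivalently, in pandas >= 0.17: df3.duplicated(keep = False) marks every duplicated row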