Example #1
 def test_nonvectorized_math_apply_on_small_dataframe(self):
     LOG.info("test_nonvectorized_math_apply_on_small_dataframe")
     df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)})
     tqdm.pandas(desc="Pandas Nonvec math apply ~ DF")
     pd_val = df.progress_apply(math_agg_foo)
     swifter_val = df.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_agg_foo)
     self.assertEqual(pd_val, swifter_val)  # equality test
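math_agg_foo is a helper defined elsewhere in the test suite; a minimal sketch of a compatible nonvectorized aggregation (an assumption, not the project's actual definition):

def math_agg_foo(row):
    # hypothetical reduction: any scalar-per-row math would exercise the same path
    return row.sum() - row.min()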
Example #2
 def _dask_apply(self, func, *args, **kwds):
     try:
         # check that the dask rolling apply matches the pandas apply
         with suppress_stdout_stderr():
             tmp_df = (
                 dd.from_pandas(self._sample_original, npartitions=self._npartitions)
                 .rolling(**{k: v for k, v in self._rolling_kwds.items() if k not in ["on", "closed"]})
                 .apply(func, *args, **kwds)
                 .compute(scheduler=self._scheduler)
             )
         self._validate_apply(
             tmp_df.equals(self._sample_pd.apply(func, *args, **kwds)),
             error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
         )
         if self._progress_bar:
             with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                 return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
         else:
             return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
     except (AttributeError, ValueError, TypeError, KeyError):
         if self._progress_bar:
             tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
             return self._obj_pd.progress_apply(func, *args, **kwds)
         else:
             return self._obj_pd.apply(func, *args, **kwds)
Example #3
def add_calendar_features(df, use_lunar=True, use_holiday=True):
    """
    Thêm các feature giờ, thứ, tháng, ngày nghỉ lễ, lịch âm, v.v. vào dataframe
    """
    df['Hour'] = df.index.to_series().dt.hour
    df['DayOfWeek'] = df.index.to_series().dt.dayofweek
    df['Month'] = df.index.to_series().dt.month
    df['DayOfYear'] = df.index.to_series().dt.dayofyear
    df['Weekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # Saturday or Sunday

    if use_lunar or use_holiday:
        tqdm.pandas()

    if use_lunar:
        lunar_features = df.index.to_series().progress_apply(
            get_lunar_calendar_features)
        df = df.merge(lunar_features, left_index=True, right_index=True)
        df['LeapMonth'] = df['LeapMonth'].astype(int)

    if use_holiday:
        df['Holiday'] = df.index.to_series().progress_apply(
            get_holiday)  # Running this will take a lot of time
        df['IsHoliday'] = (df['Holiday'] != 'No').astype(int)
    return df
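get_lunar_calendar_features and get_holiday are assumed to live elsewhere in the project; hypothetical stubs matching the shapes this function expects (a pd.Series of lunar fields per timestamp, and a holiday name or 'No'):

import pandas as pd

def get_lunar_calendar_features(ts):
    # hypothetical placeholder: a real version would convert ts via a
    # lunar-calendar library; LeapMonth must be castable to int
    return pd.Series({'LunarMonth': ts.month, 'LunarDay': ts.day, 'LeapMonth': False})

def get_holiday(ts):
    # hypothetical placeholder: look the date up in a holiday table
    return 'No'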
Example #4
    def apply(self, func, convert_dtype=True, args=(), **kwds):
        """
        Apply the function to the Series using swifter
        """

        # if the series is empty, return early using Pandas
        if not self._nrows:
            return self._obj.apply(func,
                                   convert_dtype=convert_dtype,
                                   args=args,
                                   **kwds)

        sample = self._obj.iloc[:self._npartitions * 2]
        # check if input is string or if the user is overriding the string processing default
        allow_dask_processing = True if self._allow_dask_on_strings else (
            sample.dtype != "object")

        if "axis" in kwds.keys():
            kwds.pop("axis")
            warnings.warn(
                "Axis keyword not necessary because applying on a Series.")

        try:  # try to vectorize
            with suppress_stdout_stderr_logging():
                tmp_df = func(sample, *args, **kwds)
                sample_df = sample.apply(func,
                                         convert_dtype=convert_dtype,
                                         args=args,
                                         **kwds)
                self._validate_apply(
                    np.array_equal(sample_df, tmp_df) &
                    (sample_df.shape == tmp_df.shape),
                    error_message=
                    "Vectorized function sample doesn't match pandas apply sample.",
                )
            return func(self._obj, *args, **kwds)
        except ERRORS_TO_HANDLE:  # if can't vectorize, estimate time to pandas apply
            wrapped = self._wrapped_apply(func,
                                          convert_dtype=convert_dtype,
                                          args=args,
                                          **kwds)
            timed = timeit.timeit(wrapped, number=N_REPEATS)
            sample_proc_est = timed / N_REPEATS
            est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0]

            # if pandas sample apply takes too long and not performing str processing, use dask
            if (est_apply_duration >
                    self._dask_threshold) and allow_dask_processing:
                return self._dask_apply(func, convert_dtype, *args, **kwds)
            else:  # use pandas
                if self._progress_bar:
                    tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                    return self._obj.progress_apply(
                        func, convert_dtype=convert_dtype, args=args, **kwds)
                else:
                    return self._obj.apply(func,
                                           convert_dtype=convert_dtype,
                                           args=args,
                                           **kwds)
Example #5
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()
        df = df.sort_values(['user_id', 'session_id', 'timestamp',
                             'step']).reset_index()

        # find the last clickout rows
        last_clickout_idxs = find_last_clickout_indices(df)
        clickout_rows = df.loc[
            last_clickout_idxs,
            ['user_id', 'session_id', 'impressions', 'index']]
        clickout_rows[
            'impressions_count'] = clickout_rows.impressions.str.split(
                '|').str.len()
        clickout_rows = clickout_rows.drop('impressions', axis=1)

        # multi-hot the counts
        one_hot_counts = np.zeros((clickout_rows.shape[0], 25), dtype=np.int8)
        for i, c in tqdm(enumerate(clickout_rows.impressions_count.values)):
            one_hot_counts[i, 0:c] = 1

        # add to the clickouts
        for i in range(25):
            clickout_rows['impr_c{}'.format(i)] = one_hot_counts[:, i]

        return clickout_rows.drop('impressions_count',
                                  axis=1).set_index('index')
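find_last_clickout_indices comes from the surrounding project; a plausible sketch, assuming df is sorted by user, session, timestamp, and step as above:

def find_last_clickout_indices(df):
    # hypothetical: index of the last 'clickout item' row of each session,
    # relying on the sort order established before the call
    clickouts = df[df.action_type == 'clickout item']
    return clickouts.groupby(['user_id', 'session_id'], sort=False).tail(1).index.values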
Example #6
    def _dask_applymap(self, func):
        sample = self._obj.iloc[:self._npartitions * 2, :]
        with suppress_stdout_stderr_logging():
            meta = sample.applymap(func)
        try:
            with suppress_stdout_stderr_logging():
                # check that the dask apply matches the pandas apply
                tmp_df = (dd.from_pandas(
                    sample, npartitions=self._npartitions).applymap(
                        func, meta=meta).compute(scheduler=self._scheduler))
                self._validate_apply(
                    tmp_df.equals(meta),
                    error_message=
                    "Dask applymap sample does not match pandas applymap sample."
                )
            if self._progress_bar:
                with TQDMDaskProgressBar(
                        desc=self._progress_bar_desc or "Dask Applymap"):
                    return (dd.from_pandas(
                        self._obj, npartitions=self._npartitions).applymap(
                            func,
                            meta=meta).compute(scheduler=self._scheduler))
            else:
                return (dd.from_pandas(
                    self._obj, npartitions=self._npartitions).applymap(
                        func, meta=meta).compute(scheduler=self._scheduler))
        except ERRORS_TO_HANDLE:
            # if dask apply doesn't match pandas apply, fallback to pandas
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                applymap_func = self._obj.progress_applymap
            else:
                applymap_func = self._obj.applymap

            return applymap_func(func)
Example #7
 def _dask_apply(self, func, *args, **kwds):
     try:
         # check that the dask rolling apply matches the pandas apply
         with suppress_stdout_stderr_logging():
             tmp_df = (dd.from_pandas(
                 self._comparison_pd,
                 npartitions=self._npartitions).rolling(
                     **{
                         k: v
                         for k, v in self._rolling_kwds.items()
                         if k not in ["on", "closed"]
                     }).apply(func, *args,
                              **kwds).compute(scheduler=self._scheduler))
             self._validate_apply(
                 tmp_df.equals(
                     self._comparison_pd.rolling(
                         **self._rolling_kwds).apply(func, *args, **kwds)),
                 error_message=("Dask rolling apply sample does not match "
                                "pandas rolling apply sample."),
             )
         if self._progress_bar:
             with TQDMDaskProgressBar(
                     desc=self._progress_bar_desc or "Dask Apply"):
                 return self._obj_dd.apply(
                     func, *args, **kwds).compute(scheduler=self._scheduler)
         else:
             return self._obj_dd.apply(
                 func, *args, **kwds).compute(scheduler=self._scheduler)
     except ERRORS_TO_HANDLE:
         if self._progress_bar:
             tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
             return self._obj_pd.progress_apply(func, *args, **kwds)
         else:
             return self._obj_pd.apply(func, *args, **kwds)
Example #8
    def test_nonvectorized_math_apply_on_large_dataframe_broadcast(self):
        LOG.info("test_nonvectorized_math_apply_on_large_dataframe_broadcast")
        df = pd.DataFrame({
            "x": np.random.normal(size=250_000),
            "y": np.random.uniform(size=250_000)
        })

        tqdm.pandas(desc="Pandas Nonvec math apply + broadcast ~ DF")
        start_pd = time.time()
        pd_val = df.progress_apply(math_agg_foo,
                                   axis=1,
                                   result_type="broadcast")
        end_pd = time.time()
        pd_time = end_pd - start_pd

        start_swifter = time.time()
        swifter_val = (df.swifter.set_npartitions(4).progress_bar(
            desc="Nonvec math apply + broadcast ~ DF").apply(
                math_agg_foo, axis=1, result_type="broadcast"))
        end_swifter = time.time()
        swifter_time = end_swifter - start_swifter

        self.assertEqual(pd_val, swifter_val)  # equality test
        if self.ncores > 1:  # speed test
            self.assertLess(swifter_time, pd_time)
Example #9
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        # find the clickout interactions
        res_df = df[['user_id', 'session_id', 'prices']]
        res_df = res_df[df.action_type == 'clickout item']

        # expand the prices as vector
        expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')

        # scale log
        log_prices = np.log(expanded_prices + 1)

        max_price = max(np.max(log_prices))
        min_price = min(np.min(log_prices))

        log_prices = (log_prices - min_price) / (max_price - min_price)

        # add the prices to the resulting df
        for i in range(25):
            res_df['price_{}'.format(i)] = log_prices.loc[:, i]
        
        return res_df.drop(['user_id', 'session_id', 'prices'], axis=1)
Example #10
    def test_nonvectorized_text_modin_apply_on_large_dataframe(self):
        LOG.info("test_nonvectorized_text_modin_apply_on_large_dataframe")
        df = pd.DataFrame({
            "letter": ["I", "You", "We"] * 1_000_000,
            "value": ["want to break free"] * 3_000_000
        })

        tqdm.pandas(desc="Pandas Nonvec text apply ~ DF")
        start_pd = time.time()
        pd_val = df.progress_apply(clean_text_foo, axis=1)
        end_pd = time.time()
        pd_time = end_pd - start_pd

        start_swifter = time.time()
        swifter_val = (df.swifter.allow_dask_on_strings(False).set_npartitions(
            4).set_ray_compute(num_cpus=2 if self.ncores >= 2 else 1,
                               memory=0.25).progress_bar(
                                   desc="Nonvec Modin text apply ~ DF").apply(
                                       clean_text_foo, axis=1))
        end_swifter = time.time()
        swifter_time = end_swifter - start_swifter

        self.assertEqual(pd_val, swifter_val)  # equality test
        if self.ncores > 1:  # speed test
            self.assertLess(swifter_time, pd_time)
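clean_text_foo is defined in the test helpers; a hypothetical stand-in doing comparable nonvectorized per-row text work:

def clean_text_foo(row):
    # hypothetical cleanup: lowercase the row's text and strip non-alphanumerics
    text = " ".join(str(v) for v in row)
    return "".join(c for c in text.lower() if c.isalnum() or c == " ").strip()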
Example #11
    def test_nonvectorized_text_modin_apply_on_large_dataframe_returns_series(
            self):
        LOG.info(
            "test_nonvectorized_text_modin_apply_on_large_dataframe_returns_series"
        )
        df = pd.DataFrame({"str_date": ["2000/01/01 00:00:00"] * 1_000_000})

        tqdm.pandas(desc="Pandas Nonvec text apply ~ DF -> Srs")
        start_pd = time.time()
        pd_val = df.progress_apply(lambda row: row["str_date"].split()[0],
                                   axis=1)
        end_pd = time.time()
        pd_time = end_pd - start_pd

        start_swifter = time.time()
        swifter_val = (df.swifter.allow_dask_on_strings(False).set_npartitions(
            4).set_ray_compute(
                num_cpus=2 if self.ncores >= 2 else 1,
                memory=0.25).progress_bar(
                    desc="Nonvec Modin text apply ~ DF -> Srs").apply(
                        lambda row: row["str_date"].split()[0], axis=1))
        end_swifter = time.time()
        swifter_time = end_swifter - start_swifter

        self.assertEqual(pd_val, swifter_val)  # equality test
        if self.ncores > 1:  # speed test
            self.assertLess(swifter_time, pd_time)
Example #12
 def test_nonvectorized_math_apply_on_small_series(self):
     LOG.info("test_nonvectorized_math_apply_on_small_series")
     df = pd.DataFrame({"x": np.random.normal(size=1000)})
     series = df["x"]
     tqdm.pandas(desc="Pandas Vec math apply ~ Series")
     pd_val = series.progress_apply(math_foo, compare_to=1)
     swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_foo, compare_to=1)
     self.assertEqual(pd_val, swifter_val)  # equality test
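math_foo with its compare_to keyword is likewise a test helper; a minimal sketch consistent with the call above (an assumption):

import numpy as np

def math_foo(x, compare_to=1):
    # hypothetical scalar math that branches on the keyword argument
    return np.exp(x) if x < compare_to else np.log(x + 1)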
Example #13
def merge_speed_events(speed_df, events_df):
    tqdm.pandas()
    events_with_sensor_df = add_possible_sensors(events_df)
    events_with_sensor_df['sensors'] = events_with_sensor_df.progress_apply(
        lambda row: [x for x in row.ROAD_SENSORS if row.KM_START <= x <= row.KM_END], axis=1)
    events_with_sensor_df = events_with_sensor_df[
        events_with_sensor_df['sensors'].str.len() > 0]
    return events_with_sensor_df.drop('ROAD_SENSORS', axis=1)
Example #14
    def clean_tweets(self, col_name):
        print('Cleaning tweets...')
        st = time()
        tqdm.pandas()
        self.df_[col_name] = self.df_.progress_apply(
            lambda row: self.process_tweet(row[col_name]), axis=1)
        end = time()
        print('Finished in {0:.2f} minutes.'.format((end - st) / 60))

        return self
Example #15
    def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
        samp = self._obj.iloc[: self._npartitions * 2, :]
        meta = samp.apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        )
        try:
            if broadcast:
                result_type = "broadcast"
            elif reduce:
                result_type = "reduce"

            tmp_df = (
                dd.from_pandas(samp, npartitions=self._npartitions)
                .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                .compute(scheduler=self._scheduler)
            )
            assert tmp_df.equals(meta)
            if self._progress_bar:
                with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                    return (
                        dd.from_pandas(self._obj, npartitions=self._npartitions)
                        .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                        .compute(scheduler=self._scheduler)
                    )
            else:
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        except (AssertionError, AttributeError, ValueError, TypeError, KeyError):
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                return self._obj.progress_apply(
                    func,
                    axis=axis,
                    broadcast=broadcast,
                    raw=raw,
                    reduce=reduce,
                    result_type=result_type,
                    args=args,
                    **kwds
                )
            else:
                return self._obj.apply(
                    func,
                    axis=axis,
                    broadcast=broadcast,
                    raw=raw,
                    reduce=reduce,
                    result_type=result_type,
                    args=args,
                    **kwds
                )
Example #16
    def _modin_apply(self,
                     func,
                     axis=0,
                     raw=None,
                     result_type=None,
                     *args,
                     **kwds):
        sample = self._obj.iloc[:self._npartitions * 2, :]
        try:
            series = False
            with suppress_stdout_stderr_logging():
                import modin.pandas as md

                sample_df = sample.apply(func,
                                         axis=axis,
                                         raw=raw,
                                         result_type=result_type,
                                         args=args,
                                         **kwds)
                # check that the modin apply matches the pandas APPLY
                tmp_df = (md.DataFrame(sample).apply(func,
                                                     axis=axis,
                                                     raw=raw,
                                                     result_type=result_type,
                                                     args=args,
                                                     **kwds)._to_pandas())
                if isinstance(sample_df, pd.Series) and isinstance(
                        tmp_df, pd.DataFrame):
                    tmp_df = pd.Series(tmp_df.values[:, 0])
                    series = True
                self._validate_apply(
                    tmp_df.equals(sample_df),
                    error_message=
                    "Modin apply sample does not match pandas apply sample.")
            output_df = (md.DataFrame(self._obj).apply(func,
                                                       *args,
                                                       axis=axis,
                                                       raw=raw,
                                                       result_type=result_type,
                                                       **kwds)._to_pandas())
            return pd.Series(output_df.values[:, 0]) if series else output_df
        except ERRORS_TO_HANDLE:
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                apply_func = self._obj.progress_apply
            else:
                apply_func = self._obj.apply
            return apply_func(func,
                              axis=axis,
                              raw=raw,
                              result_type=result_type,
                              args=args,
                              **kwds)
Example #17
def value_overlap_matching(df, progress=True):
    """A schema matching method by calculating the similarities of link values.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are supposed 
            to be found.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with "value_overlap".
    """

    df = df.copy()

    # get column names, strip URIs from them & create a dictionary that maps between them
    old_colnames = [col for col in df.columns if re.findall("http:", col)]

    col_name_dict = {}

    for name in old_colnames:
        col_name_dict[re.sub(r"^.*http://", "http://", name)] = name

    new_colnames = [
        re.sub(r"^.*http://", "http://", col) for col in old_colnames
    ]

    # Create all unique combinations from the URIs, order them alphabetically and turn them into a DataFrame
    combinations = list(itertools.combinations(new_colnames, 2))
    combinations_sorted = [sorted(x) for x in combinations]

    # transform list into sorted DataFrame
    df_combinations = pd.DataFrame(combinations_sorted,
                                   columns=["uri_1", "uri_2"])
    df_combinations = df_combinations.sort_values(by="uri_1")

    # For each combination in this DataFrame, calculate the similarity of their values
    if progress:
        tqdm.pandas(desc="Value Overlap Matching: Calculate Value Overlaps")
        df_combinations["value_overlap"] = df_combinations.progress_apply(
            lambda x: get_value_overlap(df, col_name_dict, x["uri_1"], x[
                "uri_2"]),
            axis=1)
    else:
        df_combinations["value_overlap"] = df_combinations.apply(
            lambda x: get_value_overlap(df, col_name_dict, x["uri_1"], x[
                "uri_2"]),
            axis=1)

    return df_combinations
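get_value_overlap is imported from elsewhere in the package; one plausible implementation, assuming it computes a Jaccard-style overlap between the two columns' value sets:

def get_value_overlap(df, col_name_dict, uri_1, uri_2):
    # hypothetical: share of distinct values the two columns have in common
    vals_1 = set(df[col_name_dict[uri_1]].dropna())
    vals_2 = set(df[col_name_dict[uri_2]].dropna())
    union = vals_1 | vals_2
    return len(vals_1 & vals_2) / len(union) if union else 0.0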
Example #18
def main(args):
    nl2bash = pd.read_json(args.nl2bash).T

    graph = defaultdict(lambda: set())
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        tqdm.pandas(desc="Extracting utilities graph")
    nl2bash['cmd'].progress_apply(partial(update_graph, graph=graph))

    count_utilities = Counter()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        tqdm.pandas(desc="Extracting utilities from examples")
    nl2bash['cmd'].progress_apply(
        partial(add_utilities, counter=count_utilities))
    all_commands = list(bashlint.grammar.bg.grammar.keys())
    count_utilities.update(all_commands)

    commands = pd.DataFrame.from_dict(count_utilities, orient='index', columns=["count"]).reset_index() \
        .rename(columns={'index': 'cmd'}).sort_values('count').reset_index(drop=True)
    commands['required'] = commands['cmd'].apply(number_of_required_arguments)
    print(f"Found {len(commands)} total utilities")

    manpage = pd.read_json(args.manpage, lines=True)

    commands = commands.merge(manpage[['name', 'synopsis']],
                              left_on='cmd',
                              right_on='name',
                              how='left')
    commands.loc[commands['synopsis'].isna(), 'synopsis'] = ''

    alias_to_idx = defaultdict(lambda: [])

    def get_aliases(x):
        idx = x.name
        for y in x['aliases']:
            y = y[0]
            alias_to_idx[y].append(idx)

    manpage.apply(get_aliases, axis=1)

    commands.drop_duplicates(inplace=True)
    print(f"Now {len(commands)} utilities")

    commands['options'] = commands['cmd'].apply(
        partial(get_options, manpage=manpage, alias_to_idx=alias_to_idx))
    del manpage
    results = []
    for t in tqdm(range(args.size), desc="Generating examples"):
        results.append(list(generate_commands(commands, graph)))
    results = pd.DataFrame(results, columns=["cmd", "query"])
    results.to_csv(args.output, index=False)
Example #19
def colocalize_apply(gdf1, gdf2, progress=False):
    """Colocalize gdf1 and gdf2.

    Returns:
      Two pandas Index objects, idx1 and idx2, of the same size: idx1 holds the
      indices from gdf1 that colocalize with the matching idx2 entries from gdf2.
      (Note that indices may not be unique if some rows are colocated more than once.)
    """
    if not sys.stderr.isatty() and "tqdm.std" in str(tqdm):
        progress = False
    def row_coloc(gdf_item, gdf2, gdf_geometry_name='geometry'):
        timeok_gdf2 = gdf2[gdf2_date_interval.overlaps(gdf_item.date_interval__)]

        if hasattr(gdf_item, gdf_geometry_name) and hasattr(gdf2, 'geometry'):
            item_geom = getattr(gdf_item, gdf_geometry_name)
            intersect_gdf2_ok = (timeok_gdf2.contains(item_geom)
                                 | timeok_gdf2.intersects(item_geom)
                                 | timeok_gdf2.geometry.within(item_geom))
        else:
            # if the user gave no geometry: match all indices
            intersect_gdf2_ok = slice(None)

        return timeok_gdf2[intersect_gdf2_ok].index
     
     
    if 'date_interval__' not in gdf1:
        gdf1['date_interval__'] = pd.IntervalIndex.from_arrays(gdf1['startdate'], gdf1['stopdate'])
    if 'date_interval__' in gdf2:
        gdf2_date_interval = pd.IntervalIndex(gdf2['date_interval__'])
    else:
        gdf2_date_interval = pd.IntervalIndex.from_arrays(gdf2['startdate'], gdf2['stopdate'])
 
        
    gdf1_geometry_name = gdf1.geometry.name
    gdf2_geometry_name = gdf2.geometry.name
    
    # empty index to store colocalization results
    idx1 = gdf1.index.delete(slice(None))
    idx2 = gdf2.index.delete(slice(None))
    
    if isinstance(gdf1.index,pd.MultiIndex):
        indexer1 = pd.MultiIndex.from_tuples
    else:
        indexer1 = pd.Index
    
    tqdm.pandas(disable=not progress, leave=False)
    gdf2_coloc_idx = gdf1.progress_apply(
        lambda row: row_coloc(row, gdf2, gdf_geometry_name=gdf1.geometry.name), axis=1)
    for gdf1_idx, gdf2_idx_series in gdf2_coloc_idx.items():
        for gdf2_idx in gdf2_idx_series:
            idx1 = idx1.append(indexer1([gdf1_idx]))
            idx2 = idx2.append(indexer1([gdf2_idx]))
    return idx1, idx2
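A hypothetical usage sketch on two toy GeoDataFrames (the 'startdate'/'stopdate' column names are assumed by the function above):

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# two one-row frames that overlap in both time and space
gdf1 = gpd.GeoDataFrame({'startdate': pd.to_datetime(['2020-01-01']),
                         'stopdate': pd.to_datetime(['2020-01-02'])},
                        geometry=[Point(0, 0)])
gdf2 = gpd.GeoDataFrame({'startdate': pd.to_datetime(['2020-01-01 12:00']),
                         'stopdate': pd.to_datetime(['2020-01-03'])},
                        geometry=[Point(0, 0)])
idx1, idx2 = colocalize_apply(gdf1, gdf2)  # both hold the single matching pair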
Example #20
def handbuilt_featurizer(df_input):
    """
    Return a dataframe with all the handbuilt features added

    :param df_input: pandas DataFrame, the input dataframe
    :return: pandas DataFrame, the output dataframe with all the handbuilt features added
    """
    # add a progress bar wrapper around DataFrame.apply method
    tqdm.pandas(desc="Handbuilt Featurizer")
    df_handbuilt = df_input["structure_oxid"].progress_apply(
        handbuilt_featurizer_helper)
    df_with_handbuilt = pd.concat([df_input, df_handbuilt], axis=1)

    return df_with_handbuilt
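handbuilt_featurizer_helper is project code; a hypothetical stub with the expected contract (a pd.Series of features per 'structure_oxid' entry, so progress_apply yields a feature DataFrame), assuming pymatgen-like structures:

import pandas as pd

def handbuilt_featurizer_helper(structure_oxid):
    # hypothetical: derive a couple of scalar features from one structure
    return pd.Series({'n_sites': len(structure_oxid),
                      'volume_per_site': structure_oxid.volume / len(structure_oxid)})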
Example #21
File: swifter.py Project: uitb/swifter
    def applymap(self, func):
        """
        Applymap the function to the DataFrame using swifter
        """

        # If there are no rows return early using Pandas
        if not self._nrows:
            return self._obj.applymap(func)

        sample = self._obj.iloc[:self._npartitions * 2, :]
        # check if input is string or if the user is overriding the string processing default
        allow_dask_processing = True if self._allow_dask_on_strings else (
            "object" not in sample.dtypes.values)

        try:  # try to vectorize
            with suppress_stdout_stderr():
                tmp_df = func(sample)
                sample_df = sample.applymap(func)
                self._validate_apply(
                    np.array_equal(sample_df, tmp_df) &
                    (sample_df.shape == tmp_df.shape),
                    error_message=
                    "Vectorized function sample does not match pandas apply sample.",
                )
            return func(self._obj)
        except (
                AttributeError,
                ValueError,
                TypeError,
                TypingError,
                KeyError,
        ):  # if can't vectorize, estimate time to pandas apply
            wrapped = self._wrapped_applymap(func)
            timed = timeit.timeit(wrapped, number=N_REPEATS)
            sample_proc_est = timed / N_REPEATS
            est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0]

            # if pandas sample apply takes too long and not performing str processing, use dask
            if (est_apply_duration >
                    self._dask_threshold) and allow_dask_processing:
                return self._dask_applymap(func)
            else:  # use pandas
                if self._progress_bar:
                    tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                    applymap_func = self._obj.progress_applymap
                else:
                    applymap_func = self._obj.applymap

                return applymap_func(func)
Example #22
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        df = df.sort_index()
        # find the clickout rows
        clickout_rows = df[[
            'user_id', 'session_id', 'action_type', 'impressions'
        ]][df.action_type == 'clickout item']
        clickout_rows[
            'impressions_count'] = clickout_rows.impressions.str.split(
                '|').str.len()
        # prepare the resulting dataframe
        res_df = df[['user_id', 'session_id']].copy()
        res_df['impressions_count'] = 0

        # iterate over the sorted reference_rows and clickout_rows
        j = 0
        clickout_indices = clickout_rows.index.values

        ck_idx = clickout_indices[0]
        next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
        next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']
        for idx, row in tqdm(res_df.iterrows()):
            # if the current index is over the last clickout, break
            if idx > clickout_indices[-1]:
                break
            # find the next clickout index
            while idx > clickout_indices[j]:
                j += 1
                ck_idx = clickout_indices[j]
                next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
                next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']

            # check if row and next_clickout are in the same session
            if row.user_id == next_clickout_user_id and row.session_id == next_clickout_sess_id:
                res_df.at[idx, 'impressions_count'] = clickout_rows.at[
                    ck_idx, 'impressions_count']

        # create the 25 categories
        one_hot_counts = np.zeros((res_df.shape[0], 25), dtype=np.int8)
        for i, c in enumerate(res_df.impressions_count.values):
            one_hot_counts[i, 0:c] = 1

        for i in range(25):
            res_df['impr_c{}'.format(i)] = one_hot_counts[:, i]

        return res_df.drop(['user_id', 'session_id', 'impressions_count'],
                           axis=1)
Example #23
def merge_duplicates(df):
    """
    Deletes from df consecutive actions of same type performed on the same reference within the same session.
    It keeps the first occurrence of those consecutive actions and for those it saves
    how many consecutive actions are occurred in column 'frequence'.
    For the non-consecutive actions, frequence is set to 1.

    :param df: DataFrame to preprocess
    :return: df: preprocessed DataFrame df with 'frequence' column
    """
    tqdm.pandas()

    duplicates_indices = []
    # points to the next valid row
    indices = df.index.values
    totlen = len(df)
    i = 0
    j = 0
    next_index = indices[j]

    for index in tqdm(indices):
        if i >= j:
            curr_actiontype = df.at[index, 'action_type']
            count = 1
            j += 1
            # check next interactions
            while j < totlen:
                next_index = indices[j]

                # iterate while the interactions are duplicated
                if curr_actiontype != 'clickout item' and \
                    df.at[index, 'user_id'] == df.at[next_index, 'user_id'] and \
                    df.at[index, 'session_id'] == df.at[next_index, 'session_id'] and \
                    df.at[index, 'reference'] == df.at[next_index, 'reference'] and \
                    curr_actiontype == df.at[next_index, 'action_type']:

                    # current interaction can be merged
                    j += 1
                    duplicates_indices.append(next_index)
                    count += 1
                else:
                    break

            # different interaction reached
            df.at[index, 'frequence'] = count
        i += 1

    # drop the duplicated indices
    return df.drop(duplicates_indices)
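A quick usage sketch on a toy interaction log (column names taken from the code above):

import pandas as pd

log = pd.DataFrame({'user_id': ['u1'] * 3, 'session_id': ['s1'] * 3,
                    'action_type': ['interaction item image'] * 3,
                    'reference': ['123', '123', '456']})
deduped = merge_duplicates(log)
# the two consecutive '123' interactions collapse into one row with frequence == 2
print(deduped[['reference', 'frequence']])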
Example #24
    def make_history_df(self, type):
        '''Create dataframe with all players' gameweek or season histories'''

        print(f'Creating player {type} dataframe')
        tqdm.pandas()

        # get histories for each player
        df = pd.Series(self.players.index).progress_apply(get_player_history,
                                                          type=type)
        # combine results into single dataframe
        df = pd.concat(p for p in df)
        # rename columns
        df.rename({'element': 'player_id'}, axis=1, inplace=True)

        return df
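get_player_history is assumed to fetch one player's records from the FPL API; a hypothetical stub with the same signature and return type (a DataFrame carrying an 'element' column, later renamed to 'player_id'):

import pandas as pd

def get_player_history(player_id, type='history'):
    # hypothetical: return the player's gameweek ('history') or past-season
    # ('history_past') records as a DataFrame
    return pd.DataFrame({'element': [player_id], 'total_points': [0]})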
Example #25
def load_concatenate_by_filename(needle: str, src_path="data/raw/pjud"):
    archivos = os.listdir(src_path)
    tqdm.pandas()

    dataframes = []

    for archivo in archivos:
        if archivo.find(needle) != -1:
            df = pd.read_csv(f"{src_path}/{archivo}",
                             sep=";",
                             encoding='cp850',
                             dtype='unicode',
                             low_memory=True)
            dataframes.append(df)

    return pd.concat(dataframes)
Example #26
 def __init__(self, pattern, backup=None, targetcol='text', lang='english', cases='lower', hashtag=True, mention=True):
     import pandas as pd
     from tqdm.auto import tqdm
     tqdm.pandas()
     import nltk
     nltk.download('stopwords', quiet=True)
     from nltk.corpus import stopwords
     self.positive, self.negative = None, None
     self.target = targetcol
     self.cases = cases
     self.backup = backup
     self.pattern = pattern
     self.hashtag = hashtag
     self.mention = mention
     self.dtm = None
     self.stopword = stopwords.words(lang)
Example #27
    def extract_feature(self):
        tqdm.pandas()
        """
        Train and test cannot be concatenated because there are some sessions that
        are splitted and they have a first part in train and the last in the test.
        In those cases, the label will be only one (since they will be treated as
        one session by 'find_last_clickout_indices' function) but they must have
        2 different labels (1 for the train half, 1 for the test half)
        """

        #df = pd.concat([data.train_df(self.mode), data.test_df(self.mode)])

        def get_label(df):
            """ Return a dataframe with: index | user_id | session_id | label """
            # find the last clickout rows
            idxs = find_last_clickout_indices(df)

            res_df = df[['user_id', 'session_id', 'reference',
                         'impressions']].loc[idxs]
            # remove the test sessions with reference NaN
            res_df = res_df.dropna(subset=['reference']).astype(
                {'reference': 'int'})
            # create impressions list
            res_df['impressions_list'] = res_df['impressions'].str.split(
                '|').apply(lambda x: list(map(int, x)))
            res_df.drop('impressions', axis=1, inplace=True)

            label_series = np.zeros(res_df.shape[0], dtype='int8')
            # iterate over the rows
            k = 0
            for row in tqdm(
                    zip(res_df['reference'], res_df['impressions_list'])):
                ref = row[0]
                impress = row[1]
                if ref in impress:
                    label_series[k] = impress.index(ref)
                k += 1
            # add the new column
            res_df['label'] = label_series

            return res_df.drop(['reference', 'impressions_list'], axis=1)

        # compute the labels for train and test
        label_train = get_label(data.train_df(self.mode))
        label_test = get_label(data.test_df(self.mode))
        return pd.concat([label_train, label_test])
Example #28
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        # count the popularity
        #cnt = Counter(df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)].reference.values.astype(int))
        pop_df = df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)] \
                    [['reference','frequence']].astype('int').groupby('reference').sum()
        cnt = pop_df.to_dict()['frequence']

        # find the clickout rows
        clickout_rows = df[df.action_type == 'clickout item'][[
            'reference', 'impressions'
        ]]
        clickout_rows = clickout_rows.fillna(-1).astype({'reference': 'int'})
        clickout_rows['impressions'] = clickout_rows.apply(
            lambda x: list(map(int, x.impressions.split('|'))), axis=1)

        # build the resulting matrix
        matrix = np.zeros((clickout_rows.shape[0], 25), dtype=int)

        i = 0
        for impressions in tqdm(clickout_rows.impressions):
            for j, impr in enumerate(impressions):
                ## OLD version
                #popularity = cnt[impr] if impr in cnt else 0
                #if popularity == row.reference:
                #    popularity -= 1

                ## NEW ! (decrease 1 to all references)
                popularity = cnt[impr] - 1 if impr in cnt else 0
                matrix[i, j] = popularity
            i += 1

        # scale log and min-max
        min_pop = np.log((pop_df['frequence'] - 1).clip(0).min() + 1)
        max_pop = np.log((pop_df['frequence'] - 1).clip(0).max() + 1)

        matrix = (np.log(matrix + 1) - min_pop) / (max_pop - min_pop)

        # add the columns to the resulting dataframe
        for i in range(25):
            clickout_rows['impr_pop{}'.format(i)] = matrix[:, i]

        return clickout_rows.drop(['reference', 'impressions'], axis=1)
Example #29
def fill_skeleton(skeleton, sales):

    tqdm.pandas()

    def change_variable(row, sales, variable):
        e, w, sk = row['EAN'], row['Week'], row['StoreKey']
        try:
            vol = sales[(e, w, sk)][variable]
            return vol
        except KeyError:
            return 0

    for var in ['Volume', 'AvgPrice']:
        skeleton[var] = skeleton.progress_apply(
            lambda row: change_variable(row, sales, var), axis=1)

    return skeleton
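The sales argument is assumed to be a dict keyed by (EAN, Week, StoreKey); a sketch of building it from a sales table with hypothetical column names matching the lookup above:

import pandas as pd

sales_df = pd.DataFrame({'EAN': [1], 'Week': [1], 'StoreKey': [7],
                         'Volume': [10], 'AvgPrice': [2.5]})
# to_dict('index') keys the rows by the same triple change_variable looks up
sales = sales_df.set_index(['EAN', 'Week', 'StoreKey'])[['Volume', 'AvgPrice']].to_dict('index')
skeleton = pd.DataFrame({'EAN': [1, 1], 'Week': [1, 2], 'StoreKey': [7, 7]})
filled = fill_skeleton(skeleton, sales)  # the missing (1, 2, 7) week falls back to 0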
Example #30
    def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
        sample = self._obj.iloc[: self._npartitions * 2, :]
        with suppress_stdout_stderr():
            meta = sample.apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            )
        try:
            if broadcast:
                result_type = "broadcast"
            elif reduce:
                result_type = "reduce"
            with suppress_stdout_stderr():
                # check that the dask apply matches the pandas apply
                tmp_df = (
                    dd.from_pandas(sample, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
                self._validate_apply(
                    tmp_df.equals(meta), error_message="Dask apply sample does not match pandas apply sample."
                )
            if self._progress_bar:
                with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                    return (
                        dd.from_pandas(self._obj, npartitions=self._npartitions)
                        .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                        .compute(scheduler=self._scheduler)
                    )
            else:
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        except (AttributeError, ValueError, TypeError, KeyError):
            # if dask apply doesn't match pandas apply, fallback to pandas
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                apply_func = self._obj.progress_apply
            else:
                apply_func = self._obj.apply

            return apply_func(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            )
Example #31
import time
import random
import pandas as pd
import numpy as np
import gc
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext
import os