def rolling_mean_by_date_by_group(data: dd = None,
                                  groupby_columns: List[str] = None,
                                  metric_columns: List[str] = None,
                                  date_column: str = None,
                                  window: int = None) -> dd:
    """
    Split the input dataframe into groups and perform a rolling average on the metric columns for each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :param window: window size to be used on rolling average
    :return: modified dask dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_rolling_mean'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: rolling_mean_by_date(
            data=df_g, metric_columns=metric_columns, window=window),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
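A minimal usage sketch (hedged): the frame, column names, and window below are illustrative, and it assumes the rolling_mean_by_date helper used inside the groupby is importable.

import dask.dataframe as dd
import pandas as pd

# Illustrative frame: two stores, six days of revenue, already sorted by date
dates = pd.date_range('2021-01-01', periods=6)
pdf = pd.DataFrame({'date': dates.repeat(2),
                    'store': ['A', 'B'] * 6,
                    'revenue': range(12)})
sales = dd.from_pandas(pdf, npartitions=1)

smoothed = rolling_mean_by_date_by_group(data=sales,
                                         groupby_columns=['store'],
                                         metric_columns=['revenue'],
                                         date_column='date',
                                         window=3)
print(smoothed.compute().head())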
def fill_missing_dates_by_group(data: dd = None,
                                groupby_columns: List[str] = None,
                                fill_method: str = None,
                                date_range: Tuple[str] = None,
                                date_column: str = None,
                                fill_value=None) -> dd:
    """
    Split the input dataframe into groups according to the groupby columns and reindex each group with continuous
    dates over the specified date range. Fill missing values according to the fill method
    :param data: dataframe
    :param groupby_columns: list of columns to group by
    :param fill_method: method used to fill missing data
    :param date_range: date range to reindex to
    :param date_column: name of date column
    :param fill_value: value used to fill missing data when no fill method is given
    :return: modified dataframe
    """
    output_schema = dict(data.dtypes)
    output_schema = list(output_schema.items())
    columns = data.columns
    data = data.set_index(date_column, sorted=True)
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: fill_missing_dates(data=df_g,
                                        date_column=date_column,
                                        fill_method=fill_method,
                                        columns=columns,
                                        date_range=date_range,
                                        fill_value=fill_value,
                                        groupby_columns=groupby_columns),
        meta=output_schema).reset_index(drop=True)
    return data
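A hedged usage sketch; the frame and the missing date are illustrative, and it relies on the fill_missing_dates helper defined further down.

import dask.dataframe as dd
import pandas as pd

# Store 'A' is missing 2021-01-03; forward-fill it over the full range
pdf = pd.DataFrame({'date': pd.to_datetime(['2021-01-01', '2021-01-02', '2021-01-04']),
                    'store': ['A', 'A', 'A'],
                    'revenue': [10, 12, 15]})
gappy = dd.from_pandas(pdf, npartitions=1)

filled = fill_missing_dates_by_group(data=gappy,
                                     groupby_columns=['store'],
                                     fill_method='ffill',
                                     date_range=('2021-01-01', '2021-01-04'),
                                     date_column='date',
                                     fill_value=None)
print(filled.compute())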
Example #3
    def fuzzy_join(orders: dd, price_over_time: dd, on: str) -> dd:
        orders.loc[:, 'price'] = pd.to_numeric(orders['price'])
        orders.loc[:, 'time'] = pd.to_datetime(orders['time'])

        price_over_time = price_over_time.reindex(orders['time'].unique(),
                                                  method='nearest')

        joined = orders.join(price_over_time, on=on).fillna(method='ffill')

        joined['relative_price'] = joined.apply(lambda row: float(row[
            'price']) - float(row['most_recent_trade_price']),
                                                axis=1)

        return joined
Example #4
    def remove_tails(self, data: dd, std_devs: int, sample_size: int = 10000):
        data = DataUtils().keep_n_std_dev(data, std_devs)
        if len(data) > sample_size:
            data = data.sample(n=sample_size)
        data = DataUtils().keep_n_std_dev(data, std_devs)

        return data
Example #5
    def fit(self, X: dd, y=None):
        """
        Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            self: the fitted transformer
        """

        # Calculate number of missing rows in each column
        summary_df = X.isnull().sum().compute()
        summary_df = summary_df.to_frame(name="nulls_count")
        summary_df["nulls_proportions"] = summary_df["nulls_count"] / X.shape[
            0].compute()
        summary_df.sort_values(by="nulls_count", ascending=False, inplace=True)

        # Select what columns should be removed, based on proportions
        mask_nulls = summary_df["nulls_proportions"] > self.nulls_threshold
        summary_df.loc[mask_nulls, "filtered_nulls"] = 1
        summary_df.loc[~mask_nulls, "filtered_nulls"] = 0

        self.feature_names = list(summary_df[mask_nulls].index.values)

        return self
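The same null-proportion rule can be checked on its own; a standalone sketch with an illustrative frame and a hypothetical 0.5 threshold:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({'mostly_null': [None, None, None, 1],
                    'dense': [1, 2, 3, 4]})
X = dd.from_pandas(pdf, npartitions=1)

nulls_threshold = 0.5  # hypothetical threshold
nulls_proportions = X.isnull().sum().compute() / X.shape[0].compute()
to_drop = list(nulls_proportions[nulls_proportions > nulls_threshold].index)
print(to_drop)  # ['mostly_null']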
Example #6
def make_filter_std_pipeline(data: dd,
                             numerical_columns: list[str] or bool = True,
                             thresholds: list[float] = None,
                             inclusive: bool = False):
    #TODO: write unit tests
    """
    Makes pipeline to filter columns according to standard deviation

    Args:
        data (dd): Data frame to be filtered
        numerical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of std values to filter. Defaults to None.
        inclusive (bool, optional):  Includes or not the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    selected_columns = data.select_dtypes(
        include=[np.number]).columns.values if isinstance(
            numerical_columns, bool) else numerical_columns
    steps = [("extract", Extract(selected_columns)),
             ("std_filter",
              Filter_Std(std_thresholds=thresholds, inclusive=inclusive))]

    return EPipeline(steps)
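A hedged usage sketch; EPipeline, Extract and Filter_Std come from the surrounding project (assumed to follow the scikit-learn fit/transform API), and the frame and thresholds are illustrative.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({'flat': [1.0, 1.0, 1.0, 1.0],
                    'spread': [1.0, 5.0, 9.0, 13.0],
                    'label': ['a', 'b', 'c', 'd']})
X = dd.from_pandas(pdf, npartitions=1)

# Numeric columns whose std falls outside [0.5, 10.0] (here 'flat') are marked
# for removal by Filter_Std
pipeline = make_filter_std_pipeline(X,
                                    numerical_columns=True,
                                    thresholds=[0.5, 10.0],
                                    inclusive=True)
X_filtered = pipeline.fit_transform(X)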
def drop_rows_with_any_null_values(data: dd = None) -> dd:
    """
    drop any rows containing null values from the input dataframe
    :param data: dask dataframe
    :return: modified dask dataframe
    """
    return data.dropna()
Example #8
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            self: the fitted transformer
        """
        subset = X.select_dtypes(exclude=[np.number, "datetime64[ns]"])

        # Calculate the entropy column-wise
        entropies_df = subset.compute().apply(entropy,
                                              axis=0).to_frame(name="entropy")
        entropies_df.reset_index(inplace=True)
        entropies_df.rename(columns={"index": "column_name"}, inplace=True)
        entropies_df.sort_values(by="entropy", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.entropy_thresholds]
        mask_entropy = entropies_df["entropy"].between(
            min(thresholds), max(thresholds), inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(entropies_df.loc[~mask_entropy,
                                                   "column_name"].values)
        mask_removed = entropies_df["column_name"].isin(self.feature_names)
        entropies_df.loc[mask_removed, "filtered_entropy"] = 1

        return self
    def get_orderbook(feed_df: pd.DataFrame, ob_state: dd, ob_state_seq) -> dd:
        """Gets those orders which are still active at the end of the feed"""
        # Find those orders which are no longer on the book
        # TODO: find those orders which were modified, handle carefully
        open_messages = feed_df[feed_df['type'] == 'open']
        open_messages['size'] = open_messages['remaining_size']
        residual_orders = open_messages[
            open_messages['sequence'] > ob_state_seq]
        all_orders = ob_state.append(residual_orders)

        done_messages = feed_df[feed_df['type'] == 'done']
        done_order_ids = list(done_messages['order_id'])

        # Find those orders which are still on the book
        ob_filtered = all_orders[~all_orders['order_id'].isin(done_order_ids)]

        # This variable is used in the pandas query below
        # final_trade_price = trades['price'].dropna().iloc[-1]

        # ob_final = DataSplitter.get_side("buy", ob_filtered).query('price < @final_trade_price').append(
        #     DataSplitter.get_side("sell", ob_filtered).query('price > @final_trade_price')
        # )

        if not OrderBookCreator.check_ob_valid(ob_filtered):
            raise AssertionError("OrderBook does not appear to be valid")

        final_seq = ob_filtered['sequence'].sort_values().iloc[-1]

        return ob_filtered.reset_index(drop=True)[[
            'side', 'order_id', 'price', 'size'
        ]], final_seq
Example #10
def encode_dataset_into_binning_indices(dd: dict, data: df, bn_attrs: list,
                                        cat_attrs: list):
    """Before constructing the Bayesian network, encode the input dataset into binning indices."""
    data_encoded = data.to_delayed()
    data_encoded = [
        dask.delayed(attributes.encode_chunk_into_binning_indices)(
            chunk, bn_attrs, cat_attrs, dd['distribution']['bins'])
        for chunk in data_encoded
    ]
    return data_encoded
def agg_insert_by_group(data: dd = None,
                        groupby_columns: List[str] = None,
                        agg_dict: dict = None,
                        insert_dict: dict = None) -> dd:
    """
    Split input dataframe into groups, apply aggregations on each group according to the aggregation dict, 
    insert aggregated results back into the original dataframe with column values specified in insert dict
    :param data: input dask dataframe
    :param groupby_columns: list of column names to group by
    :param agg_dict: dictionary of the format {column name: aggregation to perform on the column}
    :param insert_dict: dictionary of the format {column name: value of column to be set prior to insertion}
    :return: modified dataframe
    """
    agg_data = data.groupby(groupby_columns).agg(agg_dict).reset_index()
    agg_data.columns = agg_data.columns.droplevel(1)
    for column, value in insert_dict.items():
        agg_data[column] = value
    data = data.append(agg_data)
    return data
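A hedged usage sketch: roll revenue up across stores per date and append the result as 'COMBINED' rows. Note the agg_dict values are lists so the aggregated columns come back with a MultiIndex, which the droplevel(1) above expects.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({'date': pd.to_datetime(['2021-01-01', '2021-01-01',
                                            '2021-01-02', '2021-01-02']),
                    'store': ['A', 'B', 'A', 'B'],
                    'revenue': [10, 20, 30, 40]})
sales = dd.from_pandas(pdf, npartitions=1)

combined = agg_insert_by_group(data=sales,
                               groupby_columns=['date'],
                               agg_dict={'revenue': ['sum']},
                               insert_dict={'store': 'COMBINED'})
print(combined.compute())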
Example #12
def aggr_by_year_journal(df: dask.dataframe) -> dask.dataframe:
    """Aggregate issue count by year and newspaper.

    :param dask.dataframe df: Dataframe comprising all issues.
    :return: Dataframe grouped by year and source .
    :rtype: dask.dataframe

    """

    return df.groupby(['journal', 'year']).count()
def drop_duplicate_rows(data: dd = None,
                        subset: List[str] = None,
                        keep: str = None) -> dd:
    """
    Drop rows containing duplicate data for the specified subset of columns
    :param data: dask dataframe
    :param subset: list of column names
    :param keep: which duplicate to keep
    :return: modified dask dataframe
    """
    return data.drop_duplicates(subset=subset, keep=keep)
Example #14
    def transform(self, X: dd, y=None):
        """
        Remove duplicated rows

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            (dd): Dataframe with rows removed
        """
        return X.drop_duplicates(subset=self.subset)
Example #15
    def transform(self, X: dd, y=None):
        """
        Remove columns computed in fit method

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            (dd): Dataframe with columns removed
        """
        return X.drop(labels=self.feature_names, axis=1)
Example #16
    def remove_outliers(self, data: dataframe, threshold: float):

        data = data.compute(num_workers=self.workers)
        stats: dict = {
            "mean": data[self.cols["CONTINUOUS"]].mean(axis=0),
            "std_dev": data[self.cols["CONTINUOUS"]].std(axis=0)
        }

        z_cols = list(map(lambda col: "z" + col, self.cols["CONTINUOUS"]))
        zdata = data[self.cols["CONTINUOUS"]].apply(
            lambda col: (col - stats["mean"][col.name]) /
            (stats["std_dev"][col.name]),
            axis=0)
        zdata.columns = z_cols

        data = concat([data, zdata], axis=1)
        for z_col in z_cols:
            data = data[data[z_col].between(-1 * threshold, threshold)]

        return dataframe.from_pandas(
            data.drop(columns=z_cols).reset_index(drop=True),
            npartitions=self.workers)
def yoy_percent_change_by_group(data: dd = None,
                                groupby_columns: List[str] = None,
                                metric_columns: List[str] = None,
                                date_column: str = None) -> dd:
    """
    Split the dataframe into groups and calculate year over year percent change for the metric columns in each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate year over year percent change on
    :param date_column: name of date column
    :return: modified dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_yoy_pct_change'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: yoy_percent_change(data=df_g,
                                        metric_columns=metric_columns),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
Example #18
def make_filter_entropy_pipeline(data: dd,
                                 categorical_columns: list[str] or bool = True,
                                 thresholds: list[float] = None,
                                 inclusive: bool = False):
    #TODO: write unit tests
    """
    Makes pipeline to filter columns according to entropy

    Args:
        data (dd): Data frame to be filtered
        categorical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of entropy values to filter. Defaults to None.
        inclusive (bool, optional): Includes or not the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    selected_columns = data.select_dtypes(
        exclude=[np.number], include=["object"]).columns.values if isinstance(
            categorical_columns, bool) else categorical_columns
    steps = [("extract", Extract(selected_columns)),
             ("entropy_filter",
              Filter_Entropy(entropy_thresholds=thresholds,
                             inclusive=inclusive))]

    return EPipeline(steps)
Example #19
def date_continuity_check_by_group(data: dd = None, groupby_columns: List[str] = None, date_column: str = None) -> bool:
    """
    Split data into groups and evaluate each group, checking if it contains a set of continuous dates in its date column.
    If any group contains a discontinuity, return True; otherwise return False
    :param data: dask dataframe
    :param groupby_columns: column names to groupby
    :param date_column: date column name
    :return: boolean
    """
    output_schema = [(date_column, data[date_column].dtype)]
    output_schema.append(('date_continuity_bool', 'bool'))
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_continuity_check(data=df_g, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_continuity_bool'].compute().any()
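The per-group date_continuity_check helper is not shown here; a hypothetical stand-in matching the meta above (one row per group, flagging any gap larger than one day) could look like:

import pandas as pd

def date_continuity_check(data: pd.DataFrame, date_column: str) -> pd.DataFrame:
    # Hypothetical sketch, not the project's implementation:
    # True when consecutive dates in the group differ by more than one day
    dates = data[date_column].sort_values()
    has_gap = bool((dates.diff().dropna() > pd.Timedelta(days=1)).any())
    return pd.DataFrame({date_column: [dates.iloc[0]],
                         'date_continuity_bool': [has_gap]})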
Example #20
def date_range_check_by_group(data: dd = None, groupby_columns: List[str] = None, date_range: Tuple[str] = None,
                              date_column: str = None) -> bool:
    """
    Split the input dataframe by group and check if the min and max date of each group fall outside of the specified date range
    :param data: dask dataframe
    :param groupby_columns: list of column names to group by
    :param date_range: tuple defining required date range
    :param date_column: name of date column
    :return: bool
    """
    output_schema = ('date_range_bool', 'bool')
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_range_check(data=df_g, date_range=date_range, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_range_bool'].compute().any()
Example #21
    def impute_nulls(self, data: dataframe):
        # Impute by mean
        data[self.cols["CONTINUOUS"]] = data[self.cols["CONTINUOUS"]].fillna(
            data[self.cols["CONTINUOUS"]].mean(
                axis=0, skipna=True).compute(num_workers=self.workers),
            axis=0)

        # Impute by mode
        cat_cols: list = self.cols["CATEGORICAL"]["STRING"] +\
            self.cols["CATEGORICAL"]["NUMERIC"]
        col_modes = data[cat_cols].mode(dropna=True).compute(
            num_workers=self.workers)
        for col in cat_cols:
            data[col] = data[col].fillna(col_modes[col].iloc[0], axis=0)

        data = data.dropna(how="any")
        return data
Example #22
def write_data_by_file_extension(data: dd = None, file_path: Path = None):
    """
    write dask dataframe to file into based on the input path file extension
    :param file_path: path of output file
    :return: None
    """
    data = data.compute()
    map_file_extension_to_read_function = {
        '.csv': 'to_csv',
        '.parquet': 'to_parquet'
    }
    name, extension = os.path.splitext(file_path)
    if extension.lower() in map_file_extension_to_read_function.keys():
        write_function = getattr(
            data, map_file_extension_to_read_function[extension.lower()])
        read_function = map_file_extension_to_read_function[extension.lower()]
        write_function(file_path, index=False)
    else:
        raise Exception(f"File extention {extension} not recognized")
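A hedged usage sketch; the frame and output path are illustrative.

import dask.dataframe as dd
import pandas as pd
from pathlib import Path

ddf = dd.from_pandas(pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']}),
                     npartitions=1)
write_data_by_file_extension(data=ddf, file_path=Path('example_output.csv'))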
def fill_missing_dates(data: dd = None,
                       date_column: str = None,
                       fill_method: str = None,
                       columns=None,
                       date_range: Tuple[str] = None,
                       fill_value=None,
                       groupby_columns=None) -> dd:
    """
    Perform date fill on a single group
    """
    all_dates = pd.date_range(date_range[0], date_range[1])
    metric_data = data[[
        col for col in data.columns if col not in groupby_columns
    ]]
    data = data[groupby_columns].reindex(all_dates, method='nearest')
    metric_data = metric_data.reindex(all_dates,
                                      method=fill_method,
                                      fill_value=fill_value)
    data = dd.merge(data, metric_data, left_index=True, right_index=True)
    data = data.reset_index().rename(columns={'index': date_column})[columns]
    return data
def upload_parquet_file_to_es_idx(es: Elasticsearch, es_idx: str,
                                  papers_dd: dd, partition_num: int) -> None:
    try:
        start = time.time()
        papers_dd_partition = papers_dd.get_partition(partition_num)
        papers_df_partition = papers_dd_partition.compute()
        compute_end = time.time()
        print(
            f"Partition #{partition_num} compute time: {compute_end - start}")
        print(
            f"Papers partition memory size: {papers_df_partition.memory_usage(deep=True).sum()}"
        )
        print(f"Number of papers in partition: {papers_df_partition.shape[0]}")
        r = es.bulk(rec_to_actions(papers_df_partition, es_idx))
        upload_end = time.time()
        print(
            f"Partition #{partition_num} upload time: {upload_end - compute_end}"
        )
        print(
            f"Errors in uploading partition #{partition_num}: {r['errors']}\n\n"
        )
    except TransportError as te:
        transport_error_413_url = "https://github.com/elastic/elasticsearch/issues/2902"
        transport_error_429_urls = [
            "https://stackoverflow.com/questions/61870751/circuit-breaking-exception-parent-data-too-large-data-for-http-request",
            "https://github.com/elastic/elasticsearch/issues/31197",
        ]
        if te.status_code == 413:
            print(
                f"Transport error with status code 413. Chunk size is too large, so try reducing chunk size constant or increase http.max_content_length in the yml file. More info here: {transport_error_413_url}"
            )
        elif te.status_code == 429:
            print(
                f"Transport error with status code 429. Elasticsearch's JVM heap size is too small, so try increasing ES_HEAP_SIZE env var in docker-compose.yml. More info here: {transport_error_429_urls}"
            )
        else:
            # Could be ConnectionTimeout in connecting to index
            print(f"Error stacktrace: {te.error, te.info}")
        raise te
Example #25
    def apply(self, df: dd, scheduler: Scheduler = "processes") -> np.ndarray:
        """Label Dask DataFrame of data points with LFs.

        Parameters
        ----------
        df
            Dask DataFrame containing data points to be labeled by LFs
        scheduler
            A Dask scheduling configuration: either a string option or
            a ``Client``. For more information, see
            https://docs.dask.org/en/stable/scheduling.html#

        Returns
        -------
        np.ndarray
            Matrix of labels emitted by LFs
        """
        apply_fn = partial(apply_lfs_to_data_point, lfs=self._lfs)
        map_fn = df.map_partitions(lambda p_df: p_df.apply(apply_fn, axis=1))
        labels = map_fn.compute(scheduler=scheduler)
        labels_with_index = rows_to_triplets(labels)
        return self._numpy_from_row_data(labels_with_index)
Example #26
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            self: the fitted transformer
        """
        subset = X.select_dtypes(include=[np.number])

        # Calculate the standard deviation column-wise
        stds = np.nanstd(subset, axis=0)

        stds_df = pd.DataFrame.from_dict({
            "column_name": subset.columns.values,
            "std": stds
        })

        stds_df.sort_values(by="std", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.std_thresholds]
        mask_variance = stds_df["std"].between(min(thresholds),
                                               max(thresholds),
                                               inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(stds_df.loc[~mask_variance,
                                              "column_name"].values)
        mask_removed = stds_df["column_name"].isin(self.feature_names)

        stds_df.loc[mask_removed, "filtered_variance"] = 1
        stds_df.loc[~mask_removed, "filtered_variance"] = 0

        return self
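The interval rule above can be checked directly; a standalone sketch with an illustrative frame and threshold band:

import pandas as pd

pdf = pd.DataFrame({'flat': [1.0, 1.0, 1.0], 'spread': [1.0, 5.0, 9.0]})
stds = pdf.std()
thresholds = (0.5, 10.0)  # illustrative band of acceptable stds
keep = stds.between(min(thresholds), max(thresholds))
print(list(stds[~keep].index))  # ['flat'] would be removed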
def fill_in_missing_data(dask_df: dd) -> dd:
    return dask_df.fillna("")
def remove_papers_with_null_cols(dask_df: dd, cols: List[str]) -> dd:
    return dask_df.dropna(subset=cols, how="all")
def gather_papers_data(metadata_dd: dd) -> dd:
    return metadata_dd.map_partitions(lambda df: df.assign(
        body=retrieve_paper_body_text_for_series(df.pdf_json_files)))
def preprocess_dataset(ddf: dataframe) -> dataframe:
    """Preprocesses a dataFrame:
        - constant missing value replacement
        - lower case
        - strip accentuated characters
        - extract year from title and simplifies it to avoid redundancy
        - Stop words removal and stemming

    Parameters
    ----------
    ddf: dataframe
        the dataframe to be processed.

    Returns
    -------
    dataframe
    """

    text_cols = [
        'country', 'designation', 'province', 'region_1', 'region_2',
        'taster_name', 'variety', 'winery'
    ]

    # Constant missing value replacement, lower-casing and accent stripping
    # for every plain-text column
    for col in text_cols:
        ddf = ddf.map_partitions(lambda d, c=col: d.assign(
            **{c: d[c].fillna("_missing_").str.lower().apply(unidecode)}))

    # Get year from the title
    ddf = ddf.map_partitions(lambda d: d.assign(year=d['title'].str.extract(
        r'(\d{4,})', expand=False).astype(float)))

    # Remove year and geographical info from the title. They are already in other columns.
    ddf = ddf.map_partitions(lambda d: d.assign(title=d['title']
        .fillna("_missing_").str.lower().apply(unidecode)
        .str.replace(r'(\d+ )', '', regex=True)
        .str.replace(r'\((.+)\)\s*$', '', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
        .fillna("_missing_")))

    ddf = ddf.map_partitions(lambda d: d.assign(description=d['description']
        .fillna("_missing_").str.lower().apply(stem_description)
        .fillna("_missing_")))

    return ddf
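A hedged usage sketch; the CSV path is hypothetical and the input is expected to carry the text_cols listed above plus 'title' and 'description'.

from dask import dataframe

raw = dataframe.read_csv('reviews.csv')  # hypothetical input file
clean = preprocess_dataset(raw)
print(clean[['title', 'year', 'country']].head())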