Example #1
def generate_row_hash(d: DataFrame, hash_only=False, date=None) -> DataFrame:
    """

    Parameters
    ----------
    d
    hash_only
    date

    Returns
    -------

    """
    hash_cols = [
        "date",
        "area_type",
        "area_code",
        "metric_id",
        "release_id"
    ]

    try:
        d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))
    except AttributeError:
        pass

    d.date = d.date.map(lambda x: x[:10])

    # Create hash
    hash_key = (
        d
        .loc[:, hash_cols]
        .astype(str)
        .sum(axis=1)
        .apply(str.encode)
        .apply(lambda x: blake2s(x, key=RECORD_KEY, digest_size=12).hexdigest())
    )

    if hash_only:
        return hash_key

    column_names = d.columns

    data = d.assign(
        hash=hash_key,
        seriesDate=date,
        id=hash_key
    ).loc[:, ['id', 'hash', 'seriesDate', *list(column_names)]]

    return data
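The hashing step above concatenates the key columns as strings and feeds the result to keyed BLAKE2. A minimal standalone sketch of that idiom, with a made-up RECORD_KEY and a toy frame:

from hashlib import blake2s

from pandas import DataFrame

RECORD_KEY = b"0123456789abcdef"  # hypothetical secret; blake2s keys are at most 32 bytes

d = DataFrame({
    "date": ["2020-11-01"],
    "area_type": ["nation"],
    "area_code": ["E92000001"],
    "metric_id": ["123"],
    "release_id": ["7"],
})

# Concatenate the key columns row-wise, then hash each concatenated string.
row_hash = (
    d.astype(str)
    .sum(axis=1)
    .apply(str.encode)
    .apply(lambda x: blake2s(x, key=RECORD_KEY, digest_size=12).hexdigest())
)
print(row_hash[0])  # 24 hex characters per row (digest_size=12)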
Example #2
def compute_distribution(data):
    pre_outstanding = 0
    open_price = data.at[0, 'open']
    data = data[['date', 'volume', 'aprice', 'outstanding']]
    data.date = data.date.str.encode("UTF-8")
    np_data = data.values
    tmp_arrary = np.zeros((2, 6), dtype=DTYPE_LIST)
    data_arrary = np.zeros((2, 6), dtype=DTYPE_LIST)
    for index, row in enumerate(np_data):
        cdate, volume, aprice, outstanding = row[[0, 1, 2, 3]]
        if 0 == index:
            t1 = (index, cdate, cdate, aprice, volume, outstanding)
            t2 = (index, cdate, cdate, open_price, outstanding - volume,
                  outstanding)
            t = np.array([t1, t2], dtype=DTYPE_LIST)
            tmp_arrary = t.copy()
        else:
            tmp_arrary = adjust_volume(tmp_arrary, index, volume, aprice,
                                       pre_outstanding, outstanding)
            tmp_arrary['date'] = cdate
            tmp_arrary['outstanding'] = outstanding
            tdata = (index, cdate, cdate, aprice, volume, outstanding)
            t = np.array([tdata], dtype=DTYPE_LIST)
            tmp_arrary = np.concatenate((tmp_arrary, np.array(t)), axis=0)
        pre_outstanding = outstanding
        tmp_arrary = tmp_arrary[tmp_arrary['volume'] > 0]
        data_arrary = tmp_arrary.copy() if 0 == index else np.concatenate(
            (data_arrary, tmp_arrary), axis=0)
    df = DataFrame(data=data_arrary, columns=CHIP_COLUMNS)
    df.date = df.date.str.decode('utf-8')
    df.sdate = df.sdate.str.decode('utf-8')
    df.price = df.price.astype(float).round(2)
    return df
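The byte encode/decode round trip above exists because the dates travel inside a NumPy structured array; a simplified sketch of that round trip (DTYPE_LIST and CHIP_COLUMNS here are stand-ins, not the project's real definitions):

import pandas as pd

DTYPE_LIST = [("date", "S10"), ("price", "f8")]   # assumed, simplified dtype
CHIP_COLUMNS = ["date", "price"]

df = pd.DataFrame({"date": ["2021-01-04", "2021-01-05"], "price": [10.0, 10.5]})
df.date = df.date.str.encode("UTF-8")             # str -> bytes for the structured dtype
np_data = df.to_records(index=False).astype(DTYPE_LIST)

out = pd.DataFrame(data=np_data, columns=CHIP_COLUMNS)
out.date = out.date.str.decode("utf-8")           # bytes -> str on the way back
print(out.dtypes)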
Example #3
def prep_activity_data(df: pd.DataFrame) -> pd.DataFrame:
    log('Prepping activity data frame')
    clean_column_names(df)
    df.date = pd.to_datetime(df.date)

    # Columns that should actually be numbers, but that could have commas in
    # them, so pandas treats them as objects (strings)
    number_cols = [
        'calories_burned', 'steps', 'minutes_sedentary',
        'minutes_lightly_active', 'minutes_fairly_active',
        'minutes_very_active', 'activity_calories'
    ]
    # We'll `select_dtypes` here so that if a column is already numeric, we
    # won't try to re-process it.
    for col in df[number_cols].select_dtypes('object'):
        df[col] = handle_commas(df[col])

    # We'll do a little bit of feature engineering here and combine the separate
    # active minutes columns into a single column representing overall active
    # minutes.
    df['minutes_active'] = (df.minutes_lightly_active +
                            df.minutes_fairly_active + df.minutes_very_active)
    df.drop([
        'minutes_lightly_active', 'minutes_fairly_active',
        'minutes_very_active'
    ],
            axis=1,
            inplace=True)

    df['month'] = df.date.dt.strftime('%m-%b')
    df['weekday'] = df.date.dt.day_name().str[:3]

    return df.sort_values(by='date')
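`clean_column_names` and `handle_commas` are project helpers that are not shown here; a plausible minimal `handle_commas` (an assumption, not the author's code) just strips the thousands separators and coerces to numbers:

import pandas as pd

def handle_commas(s: pd.Series) -> pd.Series:
    """Turn strings like '1,234' into numbers; unparsable values become NaN."""
    return pd.to_numeric(s.astype(str).str.replace(",", "", regex=False), errors="coerce")

print(handle_commas(pd.Series(["1,234", "567", None])))   # 1234.0, 567.0, NaN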
Example #4
def finance_report(start: Timestamp,
                   end: Timestamp,
                   market: str,
                   symbol: str,
                   report_type: str,
                   quarter="all") -> DataFrame:
    """

    :param start: start time
    :param end: end time
    :param market: {'HK', 'CN'}
    :param symbol: stock symbol
    :param report_type: {'indicator', 'balance', 'income', 'business'}
    :param quarter: {'all', 'Q1', 'Q2', 'Q3', 'Q4'}
    :return: data frame containing items of the financial report
    """
    count = (end.to_period(freq='Q') - start.to_period(freq='Q')).n
    end_timestamp = int(end.timestamp() * 1000)
    urlpath = f"{market}/{report_type}.json?symbol={symbol}&&type={quarter}" \
              f"&is_detail=true&count={count}&timestamp={end_timestamp}"
    url = urljoin(api_ref.finance_base, urlpath)
    data = utls.fetch(url)
    data_list = data.pop('list')
    for d in data_list:
        for k in d:
            if isinstance(d[k], list):
                d[k] = d[k][0]
    df = DataFrame(data_list).drop(columns=['ctime']).rename(
        columns={
            'report_date': 'date'
        })
    # Convert the report date before it becomes the index; once `set_index`
    # has run, the column can no longer be written via `df.date`.
    df.date = df.date.astype('M8[ms]')
    df = df.set_index('date')
    df.report_name = df.report_name.str.replace('年报', 'Q4').str.replace('三季报', 'Q3')\
        .str.replace('中报', 'Q2').str.replace('一季报', 'Q1')
    return df
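The `count` sent to the API is the number of quarters between the two timestamps; period subtraction returns an offset whose `.n` attribute carries that count:

import pandas as pd

start = pd.Timestamp("2019-03-31")
end = pd.Timestamp("2020-12-31")
count = (end.to_period(freq="Q") - start.to_period(freq="Q")).n
print(count)   # 7 quarters between 2019Q1 and 2020Q4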
Example #5
 def process_df(df: pd.DataFrame, basin_indexes: List[Tuple], loop_idx: int) -> pd.DataFrame:
     """Take in a dataframe and process it before converting to array."""
     df.rename(columns={"discharge_spec": "QObs(mm/d)"}, inplace=True)
     df.date = pd.to_datetime(df.date, dayfirst=True, format="%Y-%m-%d")
     # Iterate through each category of static basin attributes and add the ones in the config yaml as a column.
     for key in DATASET_KEYS[1:]:
         # Check if any of the features requested are actually in this category; doing this gives a large speedup.
         if len(self.features[key]) > 0:
             filename = f'CAMELS_GB_{key}_attributes.csv'
             attr_df: pd.DataFrame = pd.read_csv(os.path.join(self.data_dir, filename),
                                                 usecols=['gauge_id'] + list(self.features[key]),
                                                 index_col='gauge_id')
             for name, row in attr_df.loc[basin][self.features[key]].iteritems():
                 if name == 'dom_land_cover':
                     # Label encoding is needed for only this attribute (in the landcover data).
                     dom_land_cover_dict = {"Grass and Pasture": 0, "Shrubs": 1, "Crops": 2, "Urban": 3,
                                         "Deciduous Woodland": 4, "Evergreen Woodland": 5}
                     row = dom_land_cover_dict[row]
                 df[name] = row
     # Crop the date range as much as possible.
     if len(self.dates) == 0 and self.train:
         self.dates = [df.date[0], self.train_test_split]
     elif len(self.dates) == 0 and not self.train:
         self.dates = [self.train_test_split, df.date.iloc[-1]]
     df = self._crop_dates(df, start_date=self.dates[0], end_date=self.dates[1])
     # Remove as many contiguous regions of NaNs as possible.
     df = self._remove_nan_regions(df)
     # basin_indexes is a list of tuples containing the start and end indexes for each basin,
     # in the form (start_idx, end_idx).
     if loop_idx == 0:
         basin_indexes.append((0, len(df)))
     else:
         basin_indexes.append((basin_indexes[-1][1], basin_indexes[-1][1] + len(df)))
     return df
Example #6
 def from_dataframe(cls, df: pd.DataFrame) -> CommitDataFrame:
     if (len(df) == 0):
         return cls.DF_NULL
     df.date = pd.to_datetime(df.date)
     df.set_index("date", inplace=True)
     # sort_index returns a new frame, so keep the result
     df = df.sort_index()
     return cls.up(df)
Example #7
def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index('date').resample('D').asfreq()

    expected = df.groupby('id').apply(f)
    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    df = DataFrame({
        'date':
        pd.date_range(start='2016-01-01', periods=4, freq='W'),
        'group': [1, 1, 2, 2],
        'val': [5, 6, 7, 8]
    }).set_index('date')

    def f(x):
        return x.resample('1D').ffill()

    expected = df.groupby('group').apply(f)
    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)
Example #8
def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index('date').resample('D').asfreq()
    expected = df.groupby('id').apply(f)
    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    df = DataFrame({'date': pd.date_range(start='2016-01-01',
                                          periods=4,
                                          freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    def f(x):
        return x.resample('1D').ffill()
    expected = df.groupby('group').apply(f)
    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)
Example #9
 def long_per_task(self):
     # a = self.b.get_chart_data(start_dt=datetime.now() - timedelta(days=4))
     a = self.b.get_chart_data(self.symbol)
     df = DataFrame(a)
     df.date = df.date.apply(datetime.fromtimestamp)
     self.get_degree(df, self.period_length, self.degree_level)
     self.get_targets(df, self.period_length, self.trace_level, self.target_rm_length)
     nop()
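`df.date.apply(datetime.fromtimestamp)` converts Unix epochs one element at a time and uses the local timezone; a vectorised sketch of the same conversion (in UTC) with `pd.to_datetime`:

import pandas as pd

df = pd.DataFrame({"date": [1609459200, 1609545600], "close": [100.0, 101.5]})
df.date = pd.to_datetime(df.date, unit="s")   # 2021-01-01, 2021-01-02 (naive UTC)
print(df)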
Example #10
def normalise_records(d: DataFrame,
                      zero_filled: Iterable[str] = tuple(),
                      cumulative: Iterable[str] = tuple(),
                      reset_index: bool = False) -> DataFrame:
    """

    Parameters
    ----------
    d
    zero_filled
    cumulative
    reset_index

    Returns
    -------

    """
    zero_filled = set(zero_filled).intersection(d.columns)
    cumulative = set(cumulative).intersection(d.columns)

    if not reset_index:
        d.sort_values(["areaType", "areaCode", "date"], inplace=True)
    else:
        d = (d.reset_index().sort_values(["areaType", "areaCode", "date"]))

    for col in zero_filled:
        for areaCode in unique(d.areaCode):
            dm = d.loc[d.areaCode == areaCode, [col, 'date']]
            indices = ((d.areaCode == areaCode) &
                       (d.date < dm.dropna(axis=0).date.max()) &
                       (d.date >= dm.dropna(axis=0).date.min()))
            d.loc[indices, col] = d.loc[indices, col].fillna(0)

    # Area names are scattered around - we cannot use
    # normal `fillna` to fill them.
    if "areaName" in d.columns:
        for areaCode in unique(d.areaCode):
            area_name = unique(d.loc[d.areaCode == areaCode,
                                     "areaName"].dropna().values)[0]
            d.loc[d.areaCode == areaCode, 'areaName'] = area_name

    for col in cumulative:
        for areaCode in unique(d.areaCode):
            dm = d.loc[d.areaCode == areaCode, [col, 'date']]
            indices = ((d.areaCode == areaCode) &
                       (d.date < dm.dropna(axis=0).date.max()) &
                       (d.date >= dm.dropna(axis=0).date.min()))

            d.loc[indices, col] = d.loc[indices, col].fillna(method="ffill")

    d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))

    if "areaName" in d.columns:
        d = d.assign(areaNameLower=d.areaName.str.lower())

    return d
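The zero-filling loop above only touches dates that fall between an area's first and last reported value, so leading and trailing gaps stay as NaN. A compact sketch of that idea for one area and one column:

import pandas as pd

d = pd.DataFrame({
    "areaCode": ["A"] * 5,
    "date": pd.date_range("2020-01-01", periods=5),
    "cases": [None, 1, None, 2, None],
})

dm = d.dropna(subset=["cases"])
indices = (d.date >= dm.date.min()) & (d.date < dm.date.max())
d.loc[indices, "cases"] = d.loc[indices, "cases"].fillna(0)
print(d)   # only the gap between the first and last observation becomes 0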
Example #11
def normalise_demographics_records(
    d: DataFrame,
    zero_filled: Iterable[str] = tuple(),
    cumulative: Iterable[str] = tuple()
) -> DataFrame:
    """

    Parameters
    ----------
    d
    zero_filled
    cumulative

    Returns
    -------

    """
    zero_filled = set(zero_filled).intersection(d.columns)
    cumulative = set(cumulative).intersection(d.columns)

    d = d.reset_index().sort_values(["areaType", "areaCode", "date", "age"])

    d.loc[:, zero_filled] = (d.loc[:, zero_filled].where(
        d.loc[:, zero_filled].notnull(), 0))

    # Area names are scattered around - we cannot use
    # normal `fillna` to fill them.
    if "areaName" in d.columns:
        for areaCode in d.areaCode.dropna().unique():
            area_name = unique(d.loc[d.areaCode == areaCode,
                                     "areaName"].dropna().values)[0]
            d.loc[d.areaCode == areaCode, 'areaName'] = area_name

    # All cumulative metrics should have the same starting
    # point across different age bands.
    d.loc[d.date == d.date.min(),
          cumulative] = (d.loc[d.date == d.date.min(), cumulative].where(
              d.loc[d.date == d.date.min(), cumulative].notnull(), 0))

    for col, areaCode, age in product(cumulative, d.areaCode.unique(),
                                      d.age.unique()):
        dm = d.loc[((d.areaCode == areaCode) & (d.age == age)), [col, 'date']]

        indices = ((d.areaCode == areaCode) & (d.age == age) &
                   (d.date < dm.dropna(axis=0).date.max()) &
                   (d.date >= dm.dropna(axis=0).date.min()))

        d.loc[indices, col] = d.loc[indices, col].fillna(method="ffill")

    d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))

    if "areaName" in d.columns:
        d = d.assign(areaNameLower=d.areaName.str.lower())

    return d
Example #12
def add_columns(df: pd.DataFrame):

    # Format date.
    df.date = pd.to_datetime(df.date, format='%Y%m%d')

    # Set the date as the DataFrame's index.
    df = df.set_index('date')

    # Add date-derived columns.
    df['date'] = df.index.date
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['week'] = df.index.week
    df['dow'] = df.index.day_name()
    df['dowIndex'] = df.index.dayofweek

    # Add group-summarization columns.
    df_weekly = df.groupby('week', as_index=False)['posIncrease'].agg(
        {
            'weeklyPosIncrease': 'sum',
            'meanWeeklyPosIncrease': 'mean',
            'stdWeeklyPosIncrease': 'std',
        },
    )
    df = pd.merge(
        df, df_weekly,
        how='left', on='week',
    )
    df['pctWeeklyPosIncrease'] = percent(df.posIncrease, df.weeklyPosIncrease)
    df['zscoreWeeklyPosIncrease'] = zScore(
        df.posIncrease,
        df.meanWeeklyPosIncrease,
        df.stdWeeklyPosIncrease,
    )

    # Add delta columns.
    df['day1LagDelta'] = lag_delta(df.posIncrease, 1)
    df['day1LeadDelta'] = lead_delta(df.posIncrease, 1)

    # Add local extrema columns.
    df['localMaximum'] = df.apply(local_max, axis=1)
    df['localMinimum'] = df.apply(local_min, axis=1)

    # Save a copy of the processed data.
    df.to_csv(
        f'{DATA_DIR}/02_intermediate/{DAILY}_add_columns.csv',
        index=True,
    )

    # Debug data frame.
    DEBUG and preview(df, add_columns.__name__)

    # Return data frame for reuse.
    return df
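Two calls in `add_columns` have since been removed from pandas: the dict form of `.agg` that renames columns (the "nested renamer") and `DatetimeIndex.week`. A sketch of the modern equivalents with named aggregation and `isocalendar()`, on a toy frame:

import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=10),
    "posIncrease": range(10),
})
df["week"] = df.date.dt.isocalendar().week   # replaces df.index.week

df_weekly = (
    df.groupby("week")["posIncrease"]
    .agg(weeklyPosIncrease="sum",
         meanWeeklyPosIncrease="mean",
         stdWeeklyPosIncrease="std")
    .reset_index()
)
print(df.merge(df_weekly, how="left", on="week").head())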
Example #13
def homogenise_demographics_dates(d: DataFrame):
    """

    Parameters
    ----------
    d

    Returns
    -------

    """
    d.date = to_datetime(d.date, format="%Y-%m-%d")

    col_names = d.columns

    date = date_range(
        start=to_datetime(d.date).min(),
        end=to_datetime(d.date).max()
    )

    dt_time_list = list()

    age = d.age.unique()

    for area_type in unique(d.areaType):
        values = product(
            [area_type],
            unique(d.loc[d.areaType == area_type, "areaCode"]),
            date,
            age
        )

        d_date = DataFrame(
            columns=["value"],
            index=MultiIndex.from_tuples(
                tuples=list(values),
                names=["areaType", "areaCode", "date", "age"]
            )
        )
        dt_time_list.append(d_date)

    dt_time = concat(dt_time_list)
    dt_time.reset_index(inplace=True)

    d = d.merge(dt_time, how='outer', on=['areaType', 'areaCode', 'date', 'age'])

    d.sort_values(
        ["date", "areaType", "areaCode", "age"],
        ascending=[True, True, False, True],
        inplace=True
    )

    return d.loc[:, col_names]
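The core of the function above is building the full area x date x age grid as a MultiIndex and outer-merging it back, so missing dates show up as empty rows. A stripped-down sketch of that grid construction:

from itertools import product

from pandas import DataFrame, MultiIndex, date_range

areas = ["E92000001", "W92000004"]
dates = date_range("2020-11-01", "2020-11-03")
ages = ["0_59", "60+"]

grid = DataFrame(
    columns=["value"],
    index=MultiIndex.from_tuples(
        tuples=list(product(areas, dates, ages)),
        names=["areaCode", "date", "age"],
    ),
).reset_index()

print(len(grid))   # 2 areas x 3 dates x 2 ages = 12 rows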
Example #14
def base_floating_profit(df, mdate=None):
    s_index = 0
    np_data = df.to_records(index=False)
    np_data = np_data.astype(DTYPE_LIST)
    index_array = np.arange(len(np_data))
    ppchange_array = np.zeros(len(np_data), dtype=float)
    if mdate is None:
        get_breakup_data(np_data)
        break_index_lists = np.where(np_data['breakup'] != 0)[0]
        effective_breakup_index_list = get_effective_breakup_index(
            break_index_lists, np_data)
        np_data['pday'] = 1
        np_data['base'] = np_data['close'].copy()
        if len(effective_breakup_index_list) == 0:
            np_data['profit'] = (np_data['close'] -
                                 np_data['uprice']) / np_data['uprice']
        else:
            for e_index in effective_breakup_index_list:
                if s_index == e_index:
                    if len(effective_breakup_index_list) == 1:
                        base = np_data['uprice'][s_index]
                        direction = np_data['breakup'][s_index]
                        ppchange = 1.1 if direction > 0 else 0.9
                        np_data['base'][s_index] = base
                        ppchange_array[s_index:] = ppchange
                        np_data['pday'][s_index:] = direction * (
                            index_array[s_index:] - s_index + 1)
                else:
                    base = np_data['uprice'][s_index]
                    direction = np_data['breakup'][e_index]
                    ppchange = 1.1 if direction < 0 else 0.9
                    np_data['base'][s_index:e_index] = base
                    ppchange_array[s_index:e_index] = ppchange
                    np_data['pday'][s_index:e_index] = -1 * direction * (
                        index_array[s_index:e_index] - s_index + 1)
                    s_index = e_index
                    if e_index == effective_breakup_index_list[-1]:
                        base = np_data['uprice'][e_index]
                        direction = np_data['breakup'][e_index]
                        ppchange = 1.1 if direction > 0 else 0.9
                        np_data['base'][e_index:] = base
                        ppchange_array[e_index:] = ppchange
                        np_data['pday'][e_index:] = direction * (
                            index_array[e_index:] - e_index + 1)
            np_data['profit'] = abs(
                np.log(np_data['close']) -
                np.log(np_data['base'])) / np.log(ppchange_array)
    df = DataFrame(data=np_data, columns=DATA_COLUMS)
    df.date = df.date.str.decode('utf-8')
    return df
Example #15
 def _sanitize(df: pd.DataFrame) -> pd.DataFrame:
     """
     Method used to sanitize the dataframe. It should normalize the dataframe data.
     In this example I just normalize the date format (it's true that I could do this
     in the read method), but we can imagine index validation/transformation or
     invalid-character handling. All those manipulations depend on the need.
     :param df: Data frame to sanitize
     :return: Sanitized Dataframe
     """
     if "date" in df.columns:
         df.date = pd.to_datetime(df.date,
                                  infer_datetime_format=True,
                                  cache=True).dt.strftime("%Y-%m-%d")
     return df
Example #16
def _parse_boro(data: DataFrame, column_prefix: str, fips: str) -> DataFrame:
    data = table_rename(
        data,
        {
            "DATE_OF_INTEREST": "date",
            f"{column_prefix}_CASE_COUNT": "new_confirmed",
            f"{column_prefix}_HOSPITALIZED_COUNT": "new_hospitalized",
            f"{column_prefix}_DEATH_COUNT": "new_deceased",
        },
        drop=True,
    )
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))
    data["key"] = f"US_NY_{fips}"
    return data
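`table_rename` and `datetime_isoformat` come from the surrounding pipeline and are not shown; a plausible stand-in for `datetime_isoformat` (an assumption, not the library's code) parses with the given format and returns an ISO date string:

from datetime import datetime
from typing import Optional

def datetime_isoformat(value: str, fmt: str) -> Optional[str]:
    """Parse `value` with `fmt` and return 'YYYY-MM-DD', or None if parsing fails."""
    try:
        return datetime.strptime(value, fmt).date().isoformat()
    except (TypeError, ValueError):
        return None

print(datetime_isoformat("04/17/2020", "%m/%d/%Y"))   # 2020-04-17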
Example #17
    def process_db_results(self, results: DataFrame) -> bytes:
        try:
            results.date = results.date.map(lambda x: f"{x:%Y-%m-%d}")
        except ValueError:
            pass

        res = (results.rename(
            columns={
                "areaType": "area_type",
                "areaName": "area_name",
                "areaCode": "area_code"
            }).to_json(orient="records").encode())

        return res
Example #18
def apply_charting_to_df(
    df: pd.DataFrame, chart_period: str, start_time: str, stop_time: str
):
    """Modifies the dataframe based on the chart_period, start dates and end dates
    Parameters
    ----------
        df: dataframe with data loaded
        chart_period: string, describes how often to sample data, default is '1Min' (1 minute)
            see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        start_time: datestring in YYYY-MM-DD HH:MM (ex. 2020-08-31 04:00) of when to begin the backtest
        stop_time: datestring of YYYY-MM-DD HH:MM when to stop the backtest
    Returns
        DataFrame, a sorted dataframe ready for consumption by run_backtest
    """
    if df.index.dtype != "datetime64[ns]":
        headers = df.columns.values.tolist()
        headers.extend([df.index.name])
        if "date" not in headers:
            raise Exception(
                "Data does not have a date column. Headers must include date, open, high, low, close, volume."
            )

        time_unit = detect_time_unit(df.date[1])
        df.date = pd.to_datetime(df.date, unit=time_unit)
        df.set_index("date", inplace=True)
    if start_time:
        if isinstance(start_time, datetime) or type(start_time) is int:
            time_unit = detect_time_unit(start_time)
            start_time = pd.to_datetime(start_time, unit=time_unit)
            start_time = start_time.strftime("%Y-%m-%d %H:%M:%S")

    if stop_time:
        if isinstance(stop_time, datetime) or type(stop_time) is int:
            time_unit = detect_time_unit(stop_time)
            stop_time = pd.to_datetime(stop_time, unit=time_unit)
            stop_time = stop_time.strftime("%Y-%m-%d %H:%M:%S")

    df = df.resample(chart_period).first()

    if start_time and stop_time:
        df = df[start_time:stop_time]  # noqa
    elif start_time and not stop_time:
        df = df[start_time:]  # noqa
    elif not start_time and stop_time:
        df = df[:stop_time]

    return df
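`detect_time_unit` is another helper that is not shown above; a plausible sketch (an assumption) guesses seconds versus milliseconds from the magnitude of the epoch value:

def detect_time_unit(value) -> str:
    """Return 's' for 10-digit epoch values and 'ms' for 13-digit ones."""
    return "ms" if len(str(int(value))) >= 13 else "s"

print(detect_time_unit(1598846400))      # 's'
print(detect_time_unit(1598846400000))   # 'ms'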
Example #19
def compute_oneday_distribution(pre_date_dist, cdate, pos, volume, aprice,
                                pre_outstanding, outstanding):
    np_pre_data = pre_date_dist.to_records(index=False)
    np_pre_data = np_pre_data.astype(DTYPE_LIST)
    np_pre_data = adjust_volume(np_pre_data, pos, volume, aprice,
                                pre_outstanding, outstanding)
    np_pre_data['date'] = cdate
    np_pre_data['outstanding'] = outstanding
    np_pre_data = np.concatenate(
        (np_pre_data,
         np.array([(pos, cdate, cdate, aprice, volume, outstanding)],
                  dtype=DTYPE_LIST)),
        axis=0)
    df = DataFrame(data=np_pre_data, columns=CHIP_COLUMNS)
    df = df[df.volume != 0]
    df.date = df.date.str.decode('utf-8')
    df.sdate = df.sdate.str.decode('utf-8')
    df.price = df.price.astype(float).round(2)
    return df.reset_index(drop=True)
Example #20
def magic_predict(data: pd.DataFrame,
                  steps: List[PreprocessingStep],
                  params: List[Dict[str, Any]],
                  model: Model,
                  evaluation: bool = False) -> pd.DataFrame:
    alphas = [1.028, 1.023, 1.018]
    weights = [1 / len(alphas)] * len(alphas)
    sub = 0.

    fday = datetime(2016, 4, 25)

    useless_cols = ["id", "date", "demand", "d", "wm_yr_wk"]

    max_lags = 57
    cols = [f"F{i}" for i in range(1, 29)]

    data.date = pd.to_datetime(data.date)

    pred_range = range(28, 56) if evaluation else range(0, 28)

    for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
        for tdelta in pred_range:
            reset_dataframe_pivot_cache(data)
            _process_and_predict(data,
                                 model,
                                 steps,
                                 params,
                                 fday,
                                 tdelta,
                                 max_lags,
                                 useless_cols,
                                 alpha=alpha)

        te_sub = _to_submission_format(data)

        if icount == 0:
            sub = te_sub
            sub[cols] *= weight
        else:
            sub[cols] += te_sub[cols] * weight
        print(icount, alpha, weight)

    return sub
Example #21
def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [
        ["2010-01-01", "A", 2],
        ["2010-01-02", "A", 3],
        ["2010-01-05", "A", 8],
        ["2010-01-10", "A", 7],
        ["2010-01-13", "A", 3],
        ["2010-01-01", "B", 5],
        ["2010-01-03", "B", 2],
        ["2010-01-04", "B", 1],
        ["2010-01-11", "B", 7],
        ["2010-01-14", "B", 3],
    ]

    df = DataFrame(data, columns=["date", "id", "score"])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index("date").resample("D").asfreq()

    expected = df.groupby("id").apply(f)
    result = df.set_index("date").groupby("id").resample("D").asfreq()
    tm.assert_frame_equal(result, expected)

    df = DataFrame(
        {
            "date": pd.date_range(start="2016-01-01", periods=4, freq="W"),
            "group": [1, 1, 2, 2],
            "val": [5, 6, 7, 8],
        }
    ).set_index("date")

    def f(x):
        return x.resample("1D").ffill()

    expected = df.groupby("group").apply(f)
    result = df.groupby("group").resample("1D").ffill()
    tm.assert_frame_equal(result, expected)
Example #22
def regular_predict(data: pd.DataFrame,
                    steps: List[PreprocessingStep],
                    params: List[Dict[str, Any]],
                    model: Model,
                    evaluation: bool = False) -> pd.DataFrame:
    fday = datetime(2016, 4, 25)

    useless_cols = ["id", "date", "demand", "d", "wm_yr_wk"]

    max_lags = 57

    data.date = pd.to_datetime(data.date)

    pred_range = range(28, 56) if evaluation else range(0, 28)

    for tdelta in pred_range:
        reset_dataframe_pivot_cache(data)
        _process_and_predict(data, model, steps, params, fday, tdelta,
                             max_lags, useless_cols)

    sub = _to_submission_format(data)

    return sub
Example #23
def nyt_county_normalize(df: pd.DataFrame) -> pd.DataFrame:

    df = df.rename(columns={'cases': 'confirmed'})

    # Change their dates to the date format we use
    # They use YYYY-MM-DD
    # We use M/D/YY
    def convert_nyt_date(date):
        old = datetime.strptime(date, '%Y-%m-%d')
        new = old.strftime('%-m/%-d/%y')
        return new

    df.date = df.date.map(convert_nyt_date)

    def split_nyt_data(df: pd.DataFrame) -> pd.DataFrame:
        deaths = df.loc[:, df.columns != 'cases']
        confirmed = df.loc[:, df.columns != 'deaths']
        return confirmed, deaths

    # Need to turn date column into a bunch of different columns for each state/county
    dates = list(set(df.date))
    dates.sort(key=lambda d: datetime.strptime(d, '%m/%d/%y'))

    # Split into two dataframes for cases and deaths
    confirmed, deaths = split_nyt_data(df)

    # Do the following for each of recovered and deaths
    def transpose_nyt_data(df: pd.DataFrame, expand: str) -> pd.DataFrame:
        join_on = ['county', 'state', 'fips']
        state_county = set(map(tuple, df[join_on].values))
        t = pd.DataFrame(state_county, columns=join_on)
        for date in dates:
            s = df[df.date == date][join_on + [expand]]
            s = s.rename(columns={expand: date})
            t = t.merge(s, on=join_on, how='left')
        return t

    confirmed = transpose_nyt_data(confirmed, expand='confirmed')
    deaths = transpose_nyt_data(deaths, expand='deaths')

    # make fips an str instead of a float
    confirmed = confirmed.astype({'fips': 'object'})
    deaths = deaths.astype({'fips': 'object'})

    confirmed.fips = confirmed.fips.astype('Int64').astype(str).str.zfill(5)
    deaths.fips = deaths.fips.astype('Int64').astype(str).str.zfill(5)

    table = pandemics.fetch.county_table()

    def geocode_nyt(df):
        df = pd.merge(df, table, how='left', on='fips')

        cols = df.columns.tolist()
        cols = cols[:3] + cols[-2:] + cols[3:-2]
        df = df[cols]

        date_cols = [col for col in df.columns if '/' in col]
        date_retype = {d: 'Int64' for d in date_cols}

        df = df.astype(date_retype)

        return df

    confirmed = geocode_nyt(confirmed)
    deaths = geocode_nyt(deaths)

    return confirmed, deaths
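The per-date merge loop in `transpose_nyt_data` widens the frame one column at a time; on recent pandas the same wide layout can be sketched with a single `pivot_table` (an illustrative alternative, not the author's code):

import pandas as pd

df = pd.DataFrame({
    "county": ["Kent", "Kent", "King"],
    "state": ["Delaware", "Delaware", "Washington"],
    "fips": ["10001", "10001", "53033"],
    "date": ["3/1/20", "3/2/20", "3/1/20"],
    "confirmed": [1, 3, 5],
})

wide = (
    df.pivot_table(index=["county", "state", "fips"],
                   columns="date", values="confirmed", aggfunc="first")
    .reset_index()
)
print(wide)   # one column per date, one row per county/state/fips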
Example #24
 def df_cleaner(df: pd.DataFrame) -> pd.DataFrame:
     # Convert to date
     df.date = pd.to_datetime(df.date).dt.date
     return df.replace({np.nan: None})
Example #25
def get_plots(data, plots):
    # type: (List[dict], List[dict]) -> List[dcc.Graph]
    '''
    Gets Dash plots using the given dicts.
    Assumes each dict element has all columns of the table as keys.

    Args:
        data (list[dict]): List of dicts defining data.
        plots (list[dict]): List of dicts defining plots.

    Raises:
        EnforceError: If data is not a list of dicts.
        EnforceError: If plots is not a list of dicts.

    Returns:
        list[dcc.Graph]: Plots.
    '''
    msg = 'Data must be a list of dictionaries. Given value: {a}.'
    Enforce(data, 'instance of', list, message=msg)
    for item in data:
        Enforce(item, 'instance of', dict, message=msg)

    msg = 'Plots must be a list of dictionaries. Given value: {a}.'
    Enforce(plots, 'instance of', list, message=msg)
    for item in plots:
        Enforce(item, 'instance of', dict, message=msg)
# --------------------------------------------------------------------------

    data_ = DataFrame(data)
    if 'date' in data_.columns:
        data_.date = DatetimeIndex(data_.date)

    elems = []
    for i, x in enumerate(plots):
        plot = cfg.PlotItem(x)
        plot.validate()
        plot = plot.to_primitive()
        min_width = str(plot['min_width']) + '%'

        try:
            fig = sdt.get_figure(
                data_,
                filters=plot['filters'],
                group=plot['group'],
                pivot=plot['pivot'],
                **plot['figure'],
            )
            fig = dcc.Graph(
                id=f'plot-{i:02d}',
                className='plot',
                figure=fig,
                style={'min-width': min_width},
            )
        except (DataError, EnforceError):
            fig = html.Div(
                id=f'plot-{i:02d}',
                className='plot plot-error',
                style={'min-width': min_width},
                children=html.Div(
                    className='plot-error-container',
                    children=html.Div(
                        className='plot-error-message',
                        children='no data found'
                    )
                )
            )
        elems.append(fig)
    return elems
Example #26
    entry.kd = entry.kills / (entry.deaths if entry.deaths > 0 else 1)
    flattened.append(entry)

p = figure(plot_width=1400,
           plot_height=600,
           x_axis_type='datetime',
           title="Kills per Match (avg)")

colors = color_gen()
sorted_by_name = sorted(flattened, key=lambda x: x.name)
for key, scores in groupby(sorted_by_name, key=lambda x: x.name):
    scores = [score.toDict() for score in scores]
    if len(scores) > 50:
        df = DataFrame(data=scores)

        df.date = to_datetime(df.date, format='%Y-%m-%d %H:%M:%S %Z')
        df.set_index('date', inplace=True)
        df = df.resample('1d').mean()
        df = df.rolling('30d').mean()
        df = df.interpolate()
        source = ColumnDataSource(df)

        p.line(x='date',
               y='kills',
               legend=key,
               source=source,
               color=next(colors),
               line_width=4)

show(p)
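Stripped of the Bokeh plotting, the smoothing chain above is a daily resample, a 30-day rolling mean and interpolation; a self-contained sketch on synthetic data:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=180, freq="12H"),
    "kills": np.random.default_rng(0).integers(0, 20, 180),
})
df = df.set_index("date")

smoothed = df.resample("1d").mean().rolling("30d").mean().interpolate()
print(smoothed.head())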
Example #27
def prep_food_data(df: pd.DataFrame) -> pd.DataFrame:
    log('Prepping food data frame')
    clean_column_names(df)
    df.date = pd.to_datetime(df.date)
    df.calories_in = handle_commas(df.calories_in)
    return df.sort_values(by='date')
Example #28
    def score_history(self, customer_interactions: DataFrame) -> DataFrame:
        """
            This method contains the logic to calculate the brand-gender score from a list of CustomerInteraction:
            - brands purchased and added to the wish list are scored with specific weights
            - time decay is applied to the score
            - data is grouped by customer_id and the score is summed
            It should receive as input a DataFrame with the following structure:
            ['product_id', 'date', 'brand_id', 'gender', 'views', 'purchased',
            'add_to_cart', 'add_to_wishlist', 'time_on_page']

            :return: DataFrame: ['memberID', 'b_g', 'total_hits']
        """

        if customer_interactions.size < 1:
            raise Exception('Can not score empty user interactions.')

        # Combine brand and gender into one column
        customer_interactions['b_g'] = customer_interactions \
            .apply(lambda x: str(x.brand_id) + ' ' + str(x.gender), axis=1)

        # ==================== Weight users-items interactions ============== #

        # set views value for product purchased (purchased==1)
        customer_interactions.loc[customer_interactions.purchased == 1, 'views'] = \
            self._config['p_weight'] * customer_interactions[customer_interactions.purchased == 1]['views']

        # set views value for product added to wishlist or cart but not purchased  (purchased!=1)
        customer_interactions.loc[((customer_interactions.add_to_wishlist == 1)
                                   | (customer_interactions.add_to_cart == 1))
                                  & (customer_interactions.purchased != 1), 'views'] = \
            self._config['w_weight'] * customer_interactions[((customer_interactions.add_to_wishlist == 1)
                                                              | (customer_interactions.add_to_cart == 1))
                                                             & (customer_interactions.purchased != 1)]['views']

        # ==================== Apply time decay function ==================== #

        # Convert string date to datetime
        customer_interactions.date = pd.to_datetime(customer_interactions.date)
        # add a new column decay_date on the data frame with the decay function
        last_browsing_date = pd.to_datetime('now')
        customer_interactions = customer_interactions.assign(
            decay_date=lambda x: (last_browsing_date - x.date).astype(
                'timedelta64[D]').astype('int'))

        # add a new column decay calculated from decay_date and views
        decay_rate = self._get_decay_rate()
        customer_interactions = customer_interactions.assign(
            decay=lambda x: x.views * np.exp(-decay_rate * x.decay_date))

        # ==================== Aggregate and shape dataset ================= #

        # Group the dataset by customer_id, brand, gender sum the decay and rename it to views
        customer_interactions = customer_interactions.groupby(['customer_id', 'b_g']) \
            .decay.sum() \
            .rename('views') \
            .reset_index()

        # remove unnecessary column
        customer_interactions = customer_interactions[[
            'customer_id', 'b_g', 'views'
        ]]
        # rename columns to match model expectation
        customer_interactions = customer_interactions.rename(
            columns={'customer_id': 'memberID'})
        customer_interactions = customer_interactions.rename(
            columns={'views': 'total_hits'})

        return customer_interactions
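The decay step above weights each interaction by exp(-decay_rate * age in days) before summing per customer and brand/gender; a compact sketch on toy data (the decay_rate value is illustrative):

import numpy as np
import pandas as pd

decay_rate = 0.05
interactions = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "b_g": ["42 F", "42 F", "7 M"],
    "views": [3.0, 1.0, 2.0],
    "date": pd.to_datetime(["2021-01-01", "2021-03-01", "2021-02-15"]),
})

now = pd.Timestamp("2021-03-10")
age_days = (now - interactions.date).dt.days
interactions["decay"] = interactions.views * np.exp(-decay_rate * age_days)

scores = (interactions.groupby(["customer_id", "b_g"])
          .decay.sum().rename("total_hits").reset_index())
print(scores)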
Example #29
from xml.etree.ElementTree import parse
from pandas import DataFrame, Series

doc = parse('generated-data\patient-613876.fhir-bundle.xml')

root = doc.getroot()

"""for item in doc.iterfind('feed/'):
    title = item.findtext('title')
    print title
    print doc
"""
#found = [element for element in doc.iter() if element.text == 'A']
#encounters = doc.findall('{http://hl7.org/fhir}Encounter')
#print encounters

encounter_dates = []
for encounter in doc.findall('.//{http://hl7.org/fhir}Encounter'):
    period = encounter.find('{http://hl7.org/fhir}period')
    start_date = period.find('{http://hl7.org/fhir}start')
    encounter_dates.append(start_date.get('value'))

#print len(encounter_dates)

enc_dates = DataFrame(encounter_dates, columns= ['date'])
enc_dates.date = enc_dates.date.astype("datetime64")
print(enc_dates)

enc_dates.groupby([enc_dates.date.dt.week, enc_dates.date.dt.year]).count().plot(kind="barh")
Example #30
def change_by_sum(data: DataFrame,
                  metrics,
                  min_sum_allowed=None,
                  min_sum_sub=None) -> DataFrame:
    """

    Parameters
    ----------
    data
    metrics
    min_sum_allowed
    min_sum_sub

    All values in the rolling sum that are smaller than ``min_sum_allowed`` are substituted
    with ``min_sum_sub``. The latter is expected to be smaller than the former to prevent
    conflicts. At the end of the process, all calculated columns carrying ``min_sum_sub``,
    including the metric column, are substituted with ``NaN``.

    Returns
    -------

    """
    metrics = set(metrics).intersection(data.columns)

    data.sort_values(["areaType", "areaCode", "date"],
                     ascending=[True, True, True],
                     inplace=True)

    logging.info(">> Starting to calculate the rolling metrics for")

    date_fmt = "%Y-%m-%d"
    date = "date"
    unique_loc_qualifiers = ["areaType", "areaCode"]
    unique_record_qualifiers = [*unique_loc_qualifiers, date]

    for col_name in metrics:
        rolling_sum_cols = [*unique_record_qualifiers, col_name]

        rolling_sum = f"{col_name}RollingSum"
        change = f"{col_name}Change"
        direction = f"{col_name}Direction"
        change_percentage = f"{col_name}ChangePercentage"

        # Local test
        # col_names.extend([col_name, rolling_sum, change, direction, change_percentage])

        logging.info(f"\t{col_name}")

        d = data.loc[:, rolling_sum_cols]
        d.loc[:, col_name] = d.loc[:, col_name].astype(float)

        if rolling_sum not in data.columns:
            df_rsum = (d.loc[:, rolling_sum_cols].pipe(
                col2datetime, col=date,
                format=date_fmt).groupby(unique_loc_qualifiers).rolling(
                    7, on=date).sum().rename(columns={
                        col_name: rolling_sum
                    }).reset_index(
                    ).loc[:, [*unique_record_qualifiers, rolling_sum]].pipe(
                        datetime2str, col=date,
                        format=date_fmt).set_index(unique_record_qualifiers))
            logging.info("\t\tCalculated rolling sum")

            try:
                data.date = data.date.map(lambda x: x.strftime(date_fmt))
            except AttributeError:
                # Already string
                pass

            if min_sum_allowed is not None:
                df_rsum.loc[df_rsum[rolling_sum] < min_sum_allowed,
                            rolling_sum] = min_sum_sub

            data = (data.set_index(unique_record_qualifiers).join(
                df_rsum, on=unique_record_qualifiers).reset_index())
            logging.info("\t\tJoined rolling sum to dataset")

        data.loc[:, rolling_sum] = (data.groupby(unique_loc_qualifiers)
                                    [rolling_sum].apply(replace_all_zero))
        logging.info(f"\t\tGrouped data by {unique_loc_qualifiers}")

        df_tmp = data.loc[:, [*unique_record_qualifiers, rolling_sum]]

        df_tmp = df_tmp.assign(
            **{
                change: (df_tmp.pipe(col2datetime, col=date, format=date_fmt).
                         loc[:, [*unique_loc_qualifiers, rolling_sum]].groupby(
                             unique_loc_qualifiers).diff(periods=7)),
                direction: (
                    df_tmp.pipe(col2datetime, col=date, format=date_fmt).
                    loc[:, [*unique_loc_qualifiers, rolling_sum]].groupby(
                        unique_loc_qualifiers).diff(
                            periods=7).pipe(get_directions, col=rolling_sum))
            })
        logging.info("\t\tCalculated rolling change (diff)")

        percentage_value = (
            df_tmp.pipe(
                col2datetime, col=date,
                format=date_fmt).loc[:,
                                     [*unique_record_qualifiers, rolling_sum]].
            groupby(unique_loc_qualifiers).rolling(
                window=8,
                on=date)[rolling_sum].apply(calculate_percentage_change).round(
                    1).to_frame(change_percentage))
        logging.info("\t\tCalculated percentage change")

        df_tmp = (df_tmp.join(
            percentage_value, on=unique_record_qualifiers).pipe(
                datetime2str, col=date,
                format=date_fmt).set_index(unique_record_qualifiers).
                  loc[:, [change, direction, change_percentage]])
        logging.info("\t\tJoined percentage to other rolling figures")

        data = (data.join(df_tmp,
                          on=unique_record_qualifiers).reset_index(drop=True))
        logging.info("\t\tJoined rolling figures to main dataset")

        data.loc[data.loc[:, col_name].isnull(),
                 [rolling_sum, change, direction, change_percentage]] = NaN

        logging.info("\t\tFinalised the data")

        if min_sum_allowed is not None:
            data.loc[
                data[rolling_sum] == min_sum_sub,
                [rolling_sum, change, direction, change_percentage, col_name
                 ]] = NaN

    return data
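At its core the function above computes a 7-day rolling sum per area and then the change against the sum seven days earlier; a stripped-down sketch of those two steps:

import pandas as pd

d = pd.DataFrame({
    "areaCode": ["A"] * 21,
    "date": pd.date_range("2020-11-01", periods=21),
    "newCases": range(21),
})

d["newCasesRollingSum"] = (
    d.groupby("areaCode")["newCases"]
    .rolling(7).sum()
    .reset_index(level=0, drop=True)
)
# Change compared with the rolling sum seven days earlier.
d["newCasesChange"] = d.groupby("areaCode")["newCasesRollingSum"].diff(periods=7)
print(d.tail())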
Example #31
# <editor-fold desc="Description">
frmAllBus = frmTrnBus.append(frmTestBus)
frmAllChk = frmTrnChk.append(frmTestChk)

#get rid of unused
del frmTrnChk;del frmTestChk;del frmTrnBus;del frmTestBus
# </editor-fold>

#-------------------
#Data Cleaning
#-------------------

# <editor-fold desc="Description">
#convert any data types
#Review Date - unicode data into datetime
frmTrnRev.date = [datetime.strptime(date, '%Y-%m-%d') for date in frmTrnRev.date]
frmTestRev.date = [datetime.strptime(date, '%Y-%m-%d') for date in frmTestRev.date]

#Flatten any nested columns
#business categories
#user votes
frmTrnUser['votes_cool'] = [rec['cool'] for rec in frmTrnUser.votes]
frmTrnUser['votes_funny'] = [rec['funny'] for rec in frmTrnUser.votes]
frmTrnUser['votes_useful'] = [rec['useful'] for rec in frmTrnUser.votes]

#review votes
frmTrnRev['votes_cool'] = [rec['cool'] for rec in frmTrnRev.votes]
frmTrnRev['votes_funny'] = [rec['funny'] for rec in frmTrnRev.votes]
frmTrnRev['votes_useful'] = [rec['useful'] for rec in frmTrnRev.votes]

#Other misc cleaning