Пример #1
0
def augment_with_pluses(dataframe: pd.DataFrame, usd_is_1: pd.Series,
                        usd_is_2: pd.Series):
    """Add boolean columns flagging notional amounts that contain a '+'.

    Two columns are written to ``dataframe`` in place:

    * ``PLUS_USD`` -- True where ``usd_is_1`` holds and amount 1 contains
      '+', or ``usd_is_2`` holds and amount 2 contains '+'.
    * ``PLUS_CCY`` -- True where ``usd_is_2`` holds and amount 1 contains
      '+', or ``usd_is_1`` holds and amount 2 contains '+'.

    Args:
        dataframe: Frame with ``ROUNDED_NOTIONAL_AMOUNT_1`` and
            ``ROUNDED_NOTIONAL_AMOUNT_2`` columns.
        usd_is_1: Boolean mask aligned with ``dataframe``; presumably marks
            rows whose first leg is USD (inferred from the name).
        usd_is_2: Boolean mask aligned with ``dataframe``; presumably marks
            rows whose second leg is USD (inferred from the name).
    """
    # regex=False: '+' is a regex quantifier, so it must be matched literally.
    plus_1 = dataframe['ROUNDED_NOTIONAL_AMOUNT_1'].astype(str).str.contains(
        '+', regex=False)
    plus_2 = dataframe['ROUNDED_NOTIONAL_AMOUNT_2'].astype(str).str.contains(
        '+', regex=False)
    dataframe.loc[:, 'PLUS_USD'] = (usd_is_1 & plus_1) | (usd_is_2 & plus_2)
    dataframe.loc[:, 'PLUS_CCY'] = (usd_is_2 & plus_1) | (usd_is_1 & plus_2)
Пример #2
0
def remove_dissemination_id_changes(dataframe: pd.DataFrame):
    """Drop the rows that a later CORRECT or CANCEL action refers to.

    For every row whose ``ACTION`` is ``'CORRECT'`` or ``'CANCEL'``, the value
    of ``ORIGINAL_DISSEMINATION_ID`` is converted to an integer and, when that
    integer is a label of ``dataframe.index``, the row(s) carrying that label
    are dropped. A summary of the number of cancels and corrections is
    printed.

    Args:
        dataframe: Frame with ``ACTION`` and ``ORIGINAL_DISSEMINATION_ID``
            columns. The index is compared against the original IDs, so it
            presumably holds the dissemination IDs -- TODO confirm.

    Returns:
        A new frame without the referenced rows, or ``dataframe`` itself when
        nothing has to be dropped. The input frame is never modified.
    """
    actions = dataframe['ACTION']
    n_corrections = int((actions == 'CORRECT').sum())
    n_cancels = int((actions == 'CANCEL').sum())
    print(f'There have been {n_cancels} cancels and '
          f'{n_corrections} corrections in dissemination IDs')

    referenced = dataframe.loc[actions.isin(['CORRECT', 'CANCEL']),
                               'ORIGINAL_DISSEMINATION_ID']
    # A change row without an original ID has nothing to remove; skipping it
    # avoids failing on the integer conversion of a missing value.
    referenced = referenced.dropna().astype('int64')
    to_drop = referenced[referenced.isin(dataframe.index)].unique()
    if len(to_drop) > 0:
        dataframe = dataframe.drop(to_drop, axis=0)
    return dataframe
Пример #3
0
 def get_data(data: pd.DataFrame) -> pd.DataFrame:
     """Expand each row of ``data`` into one row per (word, count) line.

     For every row, the archive member named by ``row.path`` is read from the
     zip file ``countpath``; each line is stripped, decoded as UTF-8 and split
     on tabs. The resulting lists are exploded into rows and stored in the
     ``word`` and ``count`` columns, and the helper columns ``text`` and
     ``path`` are removed from the result.

     Note that the ``text`` column is first added to the frame passed in, so
     the caller's frame is modified as a side effect.

     NOTE(review): ``countpath`` and ``filter`` are not defined in this
     function; presumably they come from an enclosing scope. If ``filter``
     resolves to the builtin, the ``is not None`` check is always true --
     TODO confirm. Each line is assumed to split into exactly two fields.

     Args:
         data: Frame with a ``path`` column naming members of the archive.

     Returns:
         Frame with one row per line read, with an integer ``count`` column.
     """
     with zipfile.ZipFile(countpath) as z:

         def read_member(row):
             # Close every archive member as soon as it has been read instead
             # of leaving one open handle per row behind.
             with z.open(row.path, "r") as member:
                 return [
                     s.strip().decode("utf-8").split("\t")
                     for s in member.readlines()
                 ]

         data["text"] = data.apply(read_member, axis=1)
     data = data.explode("text")
     data[["word", "count"]] = data["text"].tolist()
     data.drop(columns=["text", "path"], inplace=True)
     data["count"] = data["count"].astype(int)
     if filter is not None:
         data = data[data["word"].map(filter)]
     return data
Пример #4
0
def amounts_to_ndf_rate(dataframe: pd.DataFrame, usd_is_1: pd.Series,
                        usd_is_2: pd.Series) -> None:
    """Derive currency, amount and NDF rate columns from the two legs.

    The following columns are written to ``dataframe`` in place:

    * ``CURRENCY`` -- ``NOTIONAL_CURRENCY_2`` where ``usd_is_1`` holds and
      ``NOTIONAL_CURRENCY_1`` where ``usd_is_2`` holds.
    * ``USD_AMOUNT`` -- rounded amount of the leg selected by the mask.
    * ``CCY_AMOUNT`` -- rounded amount of the opposite leg.
    * ``NDF_RATE`` -- ``CCY_AMOUNT / USD_AMOUNT``.

    Rows selected by neither mask are left as missing values in the new
    columns.

    Args:
        dataframe: Frame with ``NOTIONAL_CURRENCY_1/2`` and
            ``ROUNDED_NOTIONAL_AMOUNT_1/2`` columns; the amounts must
            support division.
        usd_is_1: Boolean mask aligned with ``dataframe``; presumably marks
            rows whose first leg is USD (inferred from the name).
        usd_is_2: Boolean mask aligned with ``dataframe``; presumably marks
            rows whose second leg is USD (inferred from the name).
    """
    dataframe.loc[usd_is_1, 'CURRENCY'] = dataframe.loc[
        usd_is_1, 'NOTIONAL_CURRENCY_2']
    dataframe.loc[usd_is_2, 'CURRENCY'] = dataframe.loc[
        usd_is_2, 'NOTIONAL_CURRENCY_1']

    # The right-hand sides are whole columns; pandas aligns them on the index
    # so only the masked rows are written.
    dataframe.loc[usd_is_1,
                  'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
    dataframe.loc[usd_is_2,
                  'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']
    dataframe.loc[usd_is_2,
                  'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
    dataframe.loc[usd_is_1,
                  'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']

    dataframe.loc[:, 'NDF_RATE'] = (dataframe['CCY_AMOUNT'] /
                                    dataframe['USD_AMOUNT'])
Пример #5
0
def _adjust_tstamp_drift_of_triplet(df: pd.DataFrame) -> List[pd.DataFrame]:
    """Return list of pandas DataFrames where timestamp offsets have been adjusted.

    Sorts dataframe based on timestamp, finds triplets where timestamp is equal +-2, and
    adjusts any timestamps +-2 from 2nd timestamp to be equal to 2nd timestamp. Returns
    a list of all valid triplets.

    Args:
        df: pd.DataFrame where columns "timestamp" and "millisecond" are used to adjust.

    Returns:
        Returns list of pd.DataFrame where timestamps offset +-2 from middle timestamp
        are adjusted; an empty list when the frame contains no triplet. For example:

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555369 |     995     |     69    |   12  |   3.5   |
        | 1556555370 |     005     |     69    |   12  |   3.5   |
        | 1556555371 |     010     |     69    |   12  |   3.5   |

        becomes -->

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555370 |     995     |     69    |   12  |   3.5   |
        | 1556555370 |     005     |     69    |   12  |   3.5   |
        | 1556555370 |     010     |     69    |   12  |   3.5   |
    """
    # Largest timestamp spread (1st to 3rd row) still treated as one triplet.
    ts_drift_threshold = 2
    # Minimum gap between neighbouring detections that counts as drift.
    # NOTE(review): presumably seconds (timestamp + millisecond / 1000); the
    # name suggests a travel time over 1 km -- confirm.
    ms_1km = 0.667

    # Sort dataframe by timestamps in case some timestamps are in the wrong order
    df = df.sort_values("timestamp")
    df = df.reset_index(drop=True)

    # Extract timestamps and find the last row of every triplet within dataframe
    ts = df["timestamp"]
    last_indices = ts.index[ts.diff(periods=2) <= ts_drift_threshold]
    if last_indices.empty:
        # No triplet at all: nothing to mask, adjust or group.
        return []
    all_indices = last_indices.append([last_indices - 1,
                                       last_indices - 2]).sort_values()

    # Mask out all detections that aren't triplets; rows of the same triplet
    # share one mask value so they can be grouped at the end.
    mask_values = [i for i in range(len(last_indices)) for _ in range(3)]
    df.loc[all_indices, "mask"] = mask_values
    # .copy() so the column assignments below act on an independent frame.
    df = df[df["mask"].notnull()].copy()

    # Adjust timestamps that have drifted
    # | if 2nd timestamp in triplet is much larger than the 1st, add 2nd index to list
    # | if 3rd timestamp in triplet is much larger than the 2nd, add 2nd index to list
    df["drift"] = df["timestamp"] + df["millisecond"] / 1000
    drift = df["drift"].diff()
    gap_3rd = drift.loc[last_indices]
    gap_2nd = drift.loc[last_indices - 1]
    drift_3rd = gap_3rd.where(gap_3rd.abs() >= ms_1km)
    drift_1st = gap_2nd.where(gap_2nd.abs() >= ms_1km)
    drift_indices = drift_3rd.dropna(
    ).index - 1  # -1 to get index of 2nd timestamp
    drift_indices = drift_indices.append(drift_1st.dropna().index)

    # Set timestamp 1 and 3 of each triplet with drift equal to the 2nd timestamp
    df.loc[drift_indices - 1, "timestamp"] = ts[drift_indices].values
    df.loc[drift_indices + 1, "timestamp"] = ts[drift_indices].values

    # get and return triplets as list of dataframes, without the helper columns
    triplets = [
        v.drop(["mask", "drift"], axis=1) for _, v in df.groupby("mask")
    ]
    return triplets
Пример #6
0
from typing import List

import numpy as np
import pandas as pd
from numpy.random import randn
from pandas import DataFrame, Series
sinhvien = ['Teo', 'Ty', 'Tun', 'Tuan', 'Tien']
# NOTE(review): `diem` is not defined anywhere in the visible code; it has to
# be defined before this line (presumably one entry per student) -- confirm.
data = {'diem': diem, 'sinhvien': sinhvien}
df2 = DataFrame(data)  # convert the dict into a DataFrame

indx = 'A B C D E'.split()
cols = 'Col1 Col2 Col3 Col4 Col5'.split()
# 5x5 grid of random integers drawn from [1, 100).
x = np.random.randint(1, 100, size=(5, 5))

df3 = DataFrame(x, index=indx, columns=cols)
# Reindex rows: labels missing from df3 ('F', 'G') are filled with 0.
newind = 'A B C D E F G'.split()
df4 = df3.reindex(newind, fill_value=0)
# Reindex columns: labels missing from df3 ('col6', 'col7') are filled with 0.
# This rebinds df4, replacing the row-reindexed frame from above.
cols = 'Col1 Col2 Col3 Col4 Col5 col6 col7'.split()
df4 = df3.reindex(columns=cols, fill_value=0)
df4
def substitude_row(dataset: DataFrame, repo_name: str,
                   new_row: List[str]) -> None:
    """Replace the row(s) for ``repo_name`` with ``new_row``, in place.

    Every row whose ``repo_name`` column equals ``repo_name`` is dropped and
    ``new_row`` is added as the last row of ``dataset``. The caller's frame is
    modified directly because the function returns nothing.

    Args:
        dataset: Frame with a ``repo_name`` column.
        repo_name: Value identifying the row(s) to replace.
        new_row: One value per column of ``dataset``, in column order.

    Raises:
        TypeError: Possibly, when no row matches and the index labels do not
            support ``+ 1`` (a fresh label cannot be derived).
    """
    stale = dataset.index[dataset['repo_name'] == repo_name]
    dataset.drop(labels=stale, inplace=True)

    # Pick a label that is guaranteed to be unused so that the assignment
    # below adds a row instead of overwriting an existing one.
    if len(stale) > 0:
        label = stale[0]  # freed by the drop above
    elif dataset.empty:
        label = 0
    else:
        label = dataset.index.max() + 1
    dataset.loc[label] = new_row
def find_incomplete_rows(dataset: DataFrame) -> Series:
    """Return the ``repo_name`` of every row with at least one missing value.

    Args:
        dataset: Frame with a ``repo_name`` column.

    Returns:
        The ``repo_name`` values of the incomplete rows, keeping their
        original index labels.
    """
    has_missing = dataset.isna().any(axis=1)
    return dataset.loc[has_missing, 'repo_name']
def save_to_csv(dataset: DataFrame, file_name: str) -> None:
    """Write ``dataset`` to ``file_name`` as CSV.

    The header row is written; the index is not.

    Args:
        dataset: Frame to serialise.
        file_name: Destination path of the CSV file.
    """
    dataset.to_csv(file_name, index=False, header=True)