def augment_with_pluses(dataframe: pd.DataFrame, usd_is_1: pd.Series, usd_is_2: pd.Series) -> None:
    """Augment DataFrame with bool features flagging whether currency amount strings contain '+'.

    Adds two boolean columns in place:
      * ``PLUS_USD`` — True where the USD-leg notional amount string contains '+'.
      * ``PLUS_CCY`` — True where the non-USD-leg notional amount string contains '+'.

    Args:
        dataframe: frame with ``ROUNDED_NOTIONAL_AMOUNT_1``/``_2`` columns.
        usd_is_1: boolean mask, True where leg 1 is the USD leg.
        usd_is_2: boolean mask, True where leg 2 is the USD leg.
    """
    # Vectorized substring test; regex=False so '+' is a literal, not a quantifier.
    plus_1 = dataframe['ROUNDED_NOTIONAL_AMOUNT_1'].astype(str).str.contains('+', regex=False)
    plus_2 = dataframe['ROUNDED_NOTIONAL_AMOUNT_2'].astype(str).str.contains('+', regex=False)
    dataframe.loc[:, 'PLUS_USD'] = (usd_is_1 & plus_1) | (usd_is_2 & plus_2)
    dataframe.loc[:, 'PLUS_CCY'] = (usd_is_2 & plus_1) | (usd_is_1 & plus_2)
def remove_dissemination_id_changes(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drops rows in pandas.DataFrame with updated DISSEMINATION_ID information.

    CORRECT/CANCEL actions carry an ``ORIGINAL_DISSEMINATION_ID`` pointing at the
    row (by index) that they supersede; those superseded rows are removed.

    Args:
        dataframe: frame indexed by dissemination ID, with ``ACTION`` and
            ``ORIGINAL_DISSEMINATION_ID`` columns.

    Returns:
        The frame with superseded rows dropped (original frame if none).
    """
    changes = dataframe[dataframe['ACTION'].isin(['CORRECT', 'CANCEL'])]
    n_corrections = int((changes['ACTION'] == 'CORRECT').sum())
    n_cancels = int((changes['ACTION'] == 'CANCEL').sum())
    print(f'There have been {n_cancels} cancels and '
          f'{n_corrections} corrections in dissemination IDs')
    # Guard against missing IDs: int(NaN) would raise ValueError.
    to_drop = [int(o_id) for o_id in changes['ORIGINAL_DISSEMINATION_ID']
               if pd.notna(o_id) and int(o_id) in dataframe.index]
    if to_drop:
        dataframe = dataframe.drop(to_drop, axis=0)
    return dataframe
def get_data(data: pd.DataFrame) -> pd.DataFrame:
    """Expand per-file word counts from a zip archive into word/count rows.

    For each row, opens the archive member named by ``row.path``, reads its
    tab-separated ``word<TAB>count`` lines, and explodes them into one row
    per word with integer ``count``.

    NOTE(review): relies on module-level names ``countpath`` (path to the zip
    archive) and ``filter`` (optional word predicate, shadows the builtin) —
    confirm both are defined where this module is imported.

    Args:
        data: frame with a ``path`` column naming members of the archive.

    Returns:
        Frame with ``word`` and ``count`` columns (``text``/``path`` dropped),
        optionally filtered by the module-level ``filter`` predicate.
    """
    with zipfile.ZipFile(countpath) as z:
        # One list of [word, count] pairs per row, read from the archive member.
        data["text"] = data.apply(
            lambda row: [
                s.strip().decode("utf-8").split("\t")
                for s in z.open(row.path, "r").readlines()
            ],
            axis=1,
        )
    # One row per (word, count) pair.
    data = data.explode("text")
    data[["word", "count"]] = data["text"].tolist()
    data.drop(columns=["text", "path"], inplace=True)
    data["count"] = data["count"].astype(int)
    if filter is not None:
        data = data[data["word"].map(filter)]
    return data
def amounts_to_ndf_rate(dataframe: pd.DataFrame, usd_is_1: pd.Series, usd_is_2: pd.Series) -> None:
    """Computes NDF rates from notional amounts and augments `dataframe` with an NDF rate column.

    Mutates ``dataframe`` in place, adding ``CURRENCY``, ``USD_AMOUNT``,
    ``CCY_AMOUNT`` and ``NDF_RATE`` columns.

    Args:
        dataframe: frame with ``NOTIONAL_CURRENCY_1``/``_2`` and
            ``ROUNDED_NOTIONAL_AMOUNT_1``/``_2`` columns.
        usd_is_1: boolean mask, True where leg 1 is the USD leg.
        usd_is_2: boolean mask, True where leg 2 is the USD leg.
    """
    # The non-USD leg defines the quoted currency (.loc avoids chained indexing).
    dataframe.loc[usd_is_1, 'CURRENCY'] = dataframe.loc[usd_is_1, 'NOTIONAL_CURRENCY_2']
    dataframe.loc[usd_is_2, 'CURRENCY'] = dataframe.loc[usd_is_2, 'NOTIONAL_CURRENCY_1']
    # Split the two notionals into the USD leg and the foreign-currency leg.
    dataframe.loc[usd_is_1, 'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
    dataframe.loc[usd_is_2, 'USD_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']
    dataframe.loc[usd_is_2, 'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_1']
    dataframe.loc[usd_is_1, 'CCY_AMOUNT'] = dataframe['ROUNDED_NOTIONAL_AMOUNT_2']
    # NDF rate = foreign-currency amount per 1 USD.
    dataframe.loc[:, 'NDF_RATE'] = dataframe['CCY_AMOUNT'] / dataframe['USD_AMOUNT']
def _adjust_tstamp_drift_of_triplet(df: pd.DataFrame) -> List[pd.DataFrame]:
    """Return list of pandas DataFrames where timestamp offsets has been adjusted.

    Sorts dataframe based on timestamp, finds triplets where timestamp is
    equal +-2, and adjusts any timestamps +-2 from 2nd timestamp to be equal
    to 2nd timestamp. Returns a list of all valid triplets.

    Args:
        df: pd.DataFrame where columns "timestamp" and "millisecond" are
            used to adjust.

    Returns:
        Returns list of pd.DataFrame where timestamps offset +-2 from middle
        timestamp is adjusted. For example:
        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555369 | 995         | 69        | 12    | 3.5     |
        | 1556555370 | 005         | 69        | 12    | 3.5     |
        | 1556555371 | 010         | 69        | 12    | 3.5     |
        becomes -->
        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555370 | 995         | 69        | 12    | 3.5     |
        | 1556555370 | 005         | 69        | 12    | 3.5     |
        | 1556555370 | 010         | 69        | 12    | 3.5     |
    """
    ts_drift_threshold = 2  # max whole-second spread for 3 rows to count as one triplet
    ms_1km = 0.667  # drift threshold in seconds; presumably travel time over 1 km — TODO confirm
    # Sort dataframe by timestamps in case some timestamps are in the wrong order
    df = df.sort_values("timestamp")
    df = df.reset_index(drop=True)
    # Extract timestamps and find all triplets within dataframe:
    # diff(periods=2) compares each row to the one two positions earlier, so a
    # hit at index i means rows (i-2, i-1, i) span <= ts_drift_threshold seconds.
    ts = df["timestamp"]
    last_indices = ts.index[ts.diff(periods=2) <= ts_drift_threshold]
    all_indices = last_indices.append([last_indices - 1, last_indices - 2]).sort_values()
    # Mask out all detections that aren't triplets:
    # each triplet gets one group id, repeated 3x (first, middle, last row).
    mask_values = [i for i in range(len(last_indices)) for _ in range(3)]
    df.loc[all_indices, "mask"] = mask_values
    df = df[df["mask"].notnull()]
    if df.empty:
        return []
    # Adjust timestamps that have drifted
    # | if 2nd timestamp in triplet is much larger than the 1st, add 2nd index to list
    # | if 3rd timestamp in triplet is much larger than the 2nd, add 2nd index to list
    # "drift" is the full timestamp in fractional seconds.
    df["drift"] = df.apply(lambda x: x["timestamp"] + x["millisecond"] / 1000, axis=1)
    drift = df["drift"].diff()
    drift_3rd = drift[last_indices].where(abs(drift[last_indices]) >= ms_1km)
    drift_1st = drift[last_indices - 1].where(abs(drift[last_indices - 1]) >= ms_1km)
    drift_indices = drift_3rd.dropna().index - 1  # -1 to get index of 2nd timestamp
    drift_indices = drift_indices.append(drift_1st.dropna().index)
    # Set timestamp 1 and 3 of each triplet with drift has equal to 2nd timestamp
    df.loc[drift_indices - 1, "timestamp"] = ts[drift_indices].values
    df.loc[drift_indices + 1, "timestamp"] = ts[drift_indices].values
    # get and return triplets as list of dataframes (helper columns removed)
    triplets = [v.drop(["mask", "drift"], axis=1) for _, v in df.groupby("mask")]
    # triplets = [v.drop(["mask"], axis=1) for _, v in df.groupby("mask")]
    return triplets
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from numpy.random import randn

# Student names and their scores ("diem" = score, "sinhvien" = student).
sinhvien = ['Teo', 'Ty', 'Tun', 'Tuan', 'Tien']
diem = [8, 7, 9, 6, 10]
data = {'diem': diem, 'sinhvien': sinhvien}
df2 = DataFrame(data)  # convert dict to DataFrame

indx = 'A B C D E'.split()
cols = 'Col1 Col2 Col3 Col4 Col5'.split()
# 5x5 matrix of random integers in [1, 100) — vectorized, no append loop.
x = np.random.randint(1, 100, size=(5, 5))
df3 = DataFrame(x, index=indx, columns=cols)

# reindex rows: new labels F, G are filled with 0
newind = 'A B C D E F G'.split()
df4 = df3.reindex(newind, fill_value=0)

# reindex columns: new columns col6, col7 are filled with 0
cols = 'Col1 Col2 Col3 Col4 Col5 col6 col7'.split()
df4 = df3.reindex(columns=cols, fill_value=0)
df4
def substitude_row(dataset: pd.DataFrame, repo_name: str, new_row: List[str]) -> None:
    """Replace the row(s) matching `repo_name` with `new_row`, in place.

    The original ``dataset = dataset.append(...)`` rebound a local name, so
    the new row never reached the caller (and ``DataFrame.append`` was removed
    in pandas 2.0); this version mutates `dataset` in place instead.

    Args:
        dataset: frame with a ``repo_name`` column.
        repo_name: value identifying the row(s) to replace.
        new_row: values for the replacement row, in column order.
    """
    dataset.drop(labels=dataset[dataset['repo_name'] == repo_name].index,
                 inplace=True)
    # Append in place at a fresh index label (0 when the frame is empty).
    next_label = (dataset.index.max() + 1) if len(dataset) else 0
    dataset.loc[next_label] = new_row
def find_incomplete_rows(dataset: pd.DataFrame) -> pd.Series:
    """Return the ``repo_name`` of every row that has at least one missing value.

    Args:
        dataset: frame with a ``repo_name`` column.

    Returns:
        Series of repo names for rows containing any NaN.
    """
    return dataset[dataset.isna().any(axis=1)]['repo_name']
def save_to_csv(dataset: pd.DataFrame, file_name: str) -> None:
    """Write `dataset` to `file_name` as CSV with a header row and no index column.

    Args:
        dataset: frame to persist.
        file_name: destination path for the CSV file.
    """
    dataset.to_csv(file_name, index=False, header=True)