Example #1
def get_first_barrier_touches(sep_featured, sep, ptSl: tuple, trgt: pd.Series,
                              minRet: float):
    ewmstd = sep_featured[["ewmstd"]]

    # I only consider the case where there is a timeout
    timeout = sep_featured[["timeout"]]  # NEED TO ADD

    side_long = pd.Series(1.0, index=ewmstd.index)

    # 3) form events objects, apply stop loss on t1
    events = pd.concat(
        {
            'timeout': timeout,
            'ewmstd': ewmstd,
            'side': side_long
        }, axis=1)  # .dropna(subset=["trgt"])


    df0 = pandas_mp_engine(callback=triple_barrier_search, atoms=sep_featured, \
        data={'sep': sep}, molecule_key='sep_sampled', split_strategy= 'ticker', \
            num_processes=1, molecules_per_process=1, ptSl=ptSl, minRet=minRet)

    # drop those where none of the barriers were touched (should be extremely rare, if at all I think)
    events["earliest_touch"] = df0.dropna(how='all').min(
        axis=1
    )  # pd.min ignores nan, here events["earliest_touch"] becomes the timestamp of the earliest barrier touch
    events = events.drop("side", axis=1)

    return events
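# A minimal sketch of how the "timeout" (vertical barrier) series flagged "NEED TO ADD"
# above is commonly built: for each event timestamp, take the first price bar at least
# num_days later, following the standard triple-barrier recipe. The helper name and
# arguments are illustrative, not the project's actual function.
import pandas as pd

def make_vertical_barriers(close: pd.Series, tEvents: pd.DatetimeIndex, num_days: int) -> pd.Series:
    # Index of the first close at or after each event timestamp + num_days
    idx = close.index.searchsorted(tEvents + pd.Timedelta(days=num_days))
    idx = idx[idx < close.shape[0]]  # events too near the end of the series get no vertical barrier
    return pd.Series(close.index[idx], index=tEvents[:idx.shape[0]])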
Example #2
def get_events_metalabaling(close,
                            tEvents,
                            ptSl,
                            trgt,
                            minRet,
                            numThreads,
                            t1=False,
                            side=None):
    # 1) get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # minRet

    # 2) Get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) form events object, apply stop loss on t1
    if side is None:
        side_, ptSl_ = pd.Series(1.0, index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.loc[trgt.index], ptSl[:2]

    events = pd.concat({
        "t1": t1,
        "trgt": trgt,
        "side": side_
    }, axis=1).dropna(subset=["trgt"])
    df0 = pandas_mp_engine(callback=apply_ptsl_on_t1, pdObj=("molecule", events.index), numThreads=numThreads, \
        close=close, events=events, ptSl=ptSl_)

    events["t1"] = df0.dropna(how="all").min(axis=1)  # pd.min ignores nan
    if side is None:
        events = events.drop("side", axis=1)

    return events
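# A hedged usage sketch of get_events_metalabaling: once a primary model has produced
# bet sides, they are passed via `side` so the horizontal barriers may be asymmetric and
# the resulting labels become meta-labels. The names close_px, vol_target, vertical_barriers
# and primary_side are illustrative, not from the project; the call is left as comments
# because the required inputs are not defined in this snippet.
#
# primary_side = pd.Series of {-1, 1} primary-model predictions, indexed like vol_target
# meta_events = get_events_metalabaling(close=close_px,
#                                       tEvents=vol_target.index,
#                                       ptSl=[1, 2],            # asymmetric profit-taking / stop-loss multiples
#                                       trgt=vol_target,        # e.g. an EWM std of returns
#                                       minRet=0.005,
#                                       numThreads=1,
#                                       t1=vertical_barriers,   # or False to disable the vertical barrier
#                                       side=primary_side)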
Example #3
def getEvents(close: pd.Series,
              tEvents: pd.DatetimeIndex,
              ptSl: tuple,
              trgt: pd.Series,
              minRet: float,
              numThreads: int,
              t1=False):
    # sep_featured, sep, ptSl, minRet
    """
    Finds the time of the first barrier touch.

    close:      A pandas series of prices.
    tEvents:    The pandas timeindex containing the timestamps that will seed every triple barrier. 
                These are the timestamps selected by the sampling procedures discussed in Chapter 2, 
                Section 2.5.
    ptSl:       A non-negative float that sets the width of the two barriers. A 0 value means that the 
                respective horizontal barrier (profit taking and/or stop loss) will be disabled.
    t1:         A pandas series with the timestamps of the vertical barriers. We pass a False 
                when we want to disable vertical barriers.
    trgt:       A pandas series of targets, expressed in terms of absolute returns.
    minRet:     The minimum target return required for running a triple barrier search.
    """

    # Some of this preparation might be done before giving it to this function, don't know...

    # 1) get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # drop events whose target is at or below minRet (rows are removed, not set to NaN)

    # 2) Get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) form events objects, apply stop loss on t1

    side_ = pd.Series(
        1.0, index=trgt.index
    )  # Always assume a long position when first calculating the label for side.
    events = pd.concat({
        't1': t1,
        'trgt': trgt,
        'side': side_
    }, axis=1).dropna(subset=["trgt"])

    df0 = pandas_mp_engine(callback=apply_ptsl_on_t1, pdObj=('molecule', events.index), numThreads=numThreads, \
        close=close, events=events, ptSl=[ptSl, ptSl]) # notice that barriers are symmetric when first labeling for side.

    # drop those where none of the barriers were touched (should be extremely rare, if at all I think)
    events["t1"] = df0.dropna(how='all').min(
        axis=1
    )  # pd.min ignores nan, here events["t1"] becomes the timestamp of the earliest barrier touch
    events = events.drop("side", axis=1)

    return events
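# apply_ptsl_on_t1 is called above but not shown here. Below is a minimal sketch of the
# standard triple-barrier search it presumably performs: per event, walk the price path
# up to the vertical barrier and record the first touch of the profit-taking and stop-loss
# levels. The project's real implementation may differ in naming and parallelization details.
import numpy as np
import pandas as pd

def apply_ptsl_on_t1_sketch(close: pd.Series, events: pd.DataFrame, ptSl, molecule) -> pd.DataFrame:
    events_ = events.loc[molecule]
    out = events_[["t1"]].copy(deep=True)
    # Horizontal barrier levels as multiples of the per-event target; a 0 multiple disables that barrier.
    pt = ptSl[0] * events_["trgt"] if ptSl[0] > 0 else pd.Series(np.nan, index=events_.index)
    sl = -ptSl[1] * events_["trgt"] if ptSl[1] > 0 else pd.Series(np.nan, index=events_.index)
    for loc, t1 in events_["t1"].fillna(close.index[-1]).items():
        path = close[loc:t1]                                      # price path up to the vertical barrier
        path = (path / close[loc] - 1) * events_.at[loc, "side"]  # path returns, signed by position side
        out.loc[loc, "sl"] = path[path < sl[loc]].index.min()     # earliest stop-loss touch (NaT if never)
        out.loc[loc, "pt"] = path[path > pt[loc]].index.min()     # earliest profit-taking touch (NaT if never)
    return out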
def test_sep_featured():
    global save_path, cache_dir
    num_processes = 6

    print("\n\nSEP_FEATURED - OLD METHOD\n\n")

    sep = pd.read_csv("../datasets/testing/sep.csv",
                      parse_dates=["date"],
                      index_col="date",
                      low_memory=False)

    sf1_art = pd.read_csv("../datasets/testing/sf1_art.csv", parse_dates=["calendardate", "datekey", "reportperiod"],\
        index_col="calendardate", low_memory=False)

    metadata = pd.read_csv("../datasets/sharadar/METADATA_PURGED.csv",
                           parse_dates=["firstpricedate"],
                           low_memory=False)

    tb_rate = pd.read_csv("../datasets/macro/t_bill_rate_3m.csv",
                          parse_dates=["date"],
                          index_col="date")

    sep_extended = pandas_mp_engine(callback=extend_sep_for_sampling, atoms=sep, \
                data={"sf1_art": sf1_art, "metadata": metadata}, \
                    molecule_key='sep', split_strategy='ticker', \
                        num_processes=num_processes, molecules_per_process=1)

    sep_extended.sort_values(by=["ticker", "date"],
                             ascending=True,
                             inplace=True)

    sep_adjusted = pandas_mp_engine(callback=dividend_adjusting_prices_backwards, atoms=sep_extended, data=None, \
        molecule_key='sep', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1)

    sep_adjusted_plus_returns = pandas_mp_engine(callback=add_weekly_and_12m_stock_returns, atoms=sep_adjusted, data=None, \
        molecule_key='sep', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1)

    sep_adjusted_plus_returns.sort_values(by=["ticker", "date"],
                                          ascending=True,
                                          inplace=True)

    sep_prepared = pandas_mp_engine(callback=add_equally_weighted_weekly_market_returns, atoms=sep_adjusted_plus_returns, data=None, \
        molecule_key='sep', split_strategy= 'date', \
            num_processes=num_processes, molecules_per_process=1)

    sep_prepared.sort_values(by=["ticker", "date"],
                             ascending=True,
                             inplace=True)

    sep_prepared_plus_indmom = pandas_mp_engine(callback=add_indmom, atoms=sep_prepared, data=None, \
        molecule_key='sep', split_strategy= 'industry', \
            num_processes=num_processes, molecules_per_process=1)

    sep_prepared_plus_indmom.sort_values(by=["ticker", "date"], inplace=True)

    # sep_prepared_plus_indmom.to_csv("../datasets/testing/sep_prepared.csv")

    sep_sampled = pandas_mp_engine(callback=rebase_at_each_filing_sampling, atoms=sep_prepared_plus_indmom, data=None, \
        molecule_key='observations', split_strategy='ticker', num_processes=num_processes, molecules_per_process=1, \
            days_of_distance=20)

    sep_sampled.sort_values(by=["ticker", "date"],
                            ascending=True,
                            inplace=True)


    sep_featured = pandas_mp_engine(callback=add_sep_features, atoms=sep_sampled, \
        data={'sep': sep_prepared_plus_indmom, "sf1_art": sf1_art}, molecule_key='sep_sampled', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1)

    sep_featured.sort_values(by=["ticker", "date"],
                             ascending=True,
                             inplace=True)


    tbm_labeled_sep = pandas_mp_engine(callback=add_labels_via_triple_barrier_method, atoms=sep_featured, \
        data={'sep': sep_prepared_plus_indmom}, molecule_key='sep_featured', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1, ptSl=[1, -1], min_ret=None)

    tbm_labeled_sep.sort_values(by=["ticker", "date"],
                                ascending=True,
                                inplace=True)

    erp_labeled_sep = pandas_mp_engine(callback=equity_risk_premium_labeling, atoms=tbm_labeled_sep, \
        data=None, molecule_key='sep_featured', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1, tb_rate=tb_rate)

    erp_labeled_sep.sort_values(by=["ticker", "date"],
                                ascending=True,
                                inplace=True)

    sep_featured = erp_labeled_sep

    sep_featured.sort_values(by=["ticker", "date"], inplace=True)

    sep_featured.to_csv(
        save_path +
        "/sep_featured.csv")  # I really think this is the correct result

    #______________________CHAINING MP ENGINE____________________________

    print("\n\nSEP_FEATURED - NEW METHOD\n\n")

    sep_featured_2 = generate_sep_featured(
        num_processes=num_processes,
        cache_dir=cache_dir,
        tb_rate=tb_rate,
        sep_path="../datasets/testing/sep.csv",  # paths relative to the engine I think
        sf1_art_path="../datasets/testing/sf1_art.csv",
        metadata_path="../datasets/sharadar/METADATA_PURGED.csv",
        resume=False)

    sep_featured_2 = sep_featured_2.sort_values(by=["ticker", "date"])  # Should not need this

    sep_featured_2.to_csv(save_path + "/sep_featured_2.csv")
    """
    sep_featured = sep_featured.fillna("NA")
    sep_featured_2 = sep_featured_2.fillna("NA")
    eq_result = sep_featured.eq(sep_featured_2)
    eq_result.to_csv("./testing_datasets/eq_result_sep_featured.csv")
    """

    assert sep_featured.shape[0] == sep_featured_2.shape[0]
    assert sep_featured.shape[1] == sep_featured_2.shape[1]

    failed = False
    pos = None
    errors = []
    len_sep_featured = len(sep_featured)

    for index in range(0, len_sep_featured):
        for column in sep_featured.columns:
            correct_val = sep_featured.iloc[index][column]
            if isinstance(correct_val, str):
                if correct_val != sep_featured_2.iloc[index][column]:
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif isinstance(correct_val, pd.Timestamp) or isinstance(
                    correct_val, pd.Timedelta):
                if str(correct_val) != str(sep_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif math.isnan(correct_val):
                if not math.isnan(sep_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            else:
                if correct_val != pytest.approx(
                        sep_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)

    if failed:
        print("Shape: ", sep_featured.shape, sep_featured_2.shape)
        for pos in errors:
            print("Failed at position: ", pos, " Corr: ",
                  sep_featured.iloc[pos[0]][pos[1]], "Othr: ",
                  sep_featured_2.iloc[pos[0]][pos[1]])

    assert len(errors) == 0
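# The element-wise comparison above works, but pandas ships an equivalent check. A hedged
# alternative sketch using pandas.testing.assert_frame_equal (tolerant numeric comparison,
# exact match for strings/timestamps); the tolerances shown are illustrative, not the
# project's settings. Left as comments so the test's behaviour is unchanged.
# from pandas.testing import assert_frame_equal
# assert_frame_equal(sep_featured.reset_index(drop=True),
#                    sep_featured_2.reset_index(drop=True),
#                    check_exact=False, rtol=1e-5, atol=1e-8)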
def testing_sf1_featured():
    global save_path, cache_dir
    num_processes = 6

    print("\n\nSF1_FEATURED - OLD METHOD\n\n")

    sf1_art = pd.read_csv("../datasets/testing/sf1_art_no_duplicates.csv", parse_dates=["calendardate", "datekey"],\
        index_col="calendardate", low_memory=False)

    sf1_arq = pd.read_csv("../datasets/testing/sf1_arq_no_duplicates.csv", parse_dates=["calendardate", "datekey"],\
        index_col="calendardate", low_memory=False)

    metadata = pd.read_csv("../datasets/sharadar/METADATA_PURGED.csv",
                           parse_dates=["firstpricedate"],
                           low_memory=False)

    sf1_art = sf1_art.sort_values(by=["ticker", "calendardate", "datekey"])

    sf1_arq = sf1_arq.sort_values(by=["ticker", "calendardate", "datekey"])

    sf1_featured = pandas_mp_engine(callback=add_sf1_features, atoms=sf1_art, \
        data={"sf1_arq": sf1_arq, 'metadata': metadata}, molecule_key='sf1_art', split_strategy= 'ticker', \
            num_processes=num_processes, molecules_per_process=1)

    sf1_featured = sf1_featured.sort_values(by=["ticker", "calendardate", "datekey"])

    sf1_featured = pandas_mp_engine(callback=add_industry_sf1_features, atoms=sf1_featured, \
        data={'metadata': metadata}, molecule_key='sf1_art', split_strategy= 'industry', \
            num_processes=num_processes, molecules_per_process=1)

    sf1_featured = sf1_featured.sort_values(
        by=["ticker", "calendardate", "datekey"])

    sf1_featured.to_csv(save_path + "/sf1_featured.csv")

    print("\n\nSF1_FEATURED - NEW METHOD\n\n")

    sf1_featured_2 = generate_sf1_featured(
        num_processes=num_processes,
        cache_dir=cache_dir,
        sf1_art_path="../datasets/testing/sf1_art_no_duplicates.csv",
        sf1_arq_path="../datasets/testing/sf1_arq_no_duplicates.csv",
        metadata_path="../datasets/sharadar/METADATA_PURGED.csv",
        resume=False)

    sf1_featured_2 = sf1_featured_2.sort_values(
        by=["ticker", "calendardate", "datekey"])
    sf1_featured_2.to_csv(save_path + "/sf1_featured_2.csv")

    assert sf1_featured.shape[0] == sf1_featured_2.shape[0]
    assert sf1_featured.shape[1] == sf1_featured_2.shape[1]

    failed = False
    pos = None
    errors = []
    len_sf1_featured = len(sf1_featured)

    for index in range(0, len_sf1_featured):
        for column in sf1_featured.columns:
            correct_val = sf1_featured.iloc[index][column]
            if isinstance(correct_val, str):
                if correct_val != sf1_featured_2.iloc[index][column]:
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif isinstance(correct_val, pd.Timestamp) or isinstance(
                    correct_val, pd.Timedelta):
                if str(correct_val) != str(sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif math.isnan(correct_val):
                if not math.isnan(sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            else:
                if correct_val != pytest.approx(
                        sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)

    if failed:
        print("Shape: ", sf1_featured.shape, sf1_featured_2.shape)
        for pos in errors:
            print("Failed at position: ", pos, " Corr: ", sf1_featured.iloc[pos[0]][pos[1]], "Othr: ", \
                sf1_featured_2.iloc[pos[0]][pos[1]], sf1_featured.iloc[pos[0]]["datekey"], sf1_featured.iloc[pos[0]]["ticker"], \
                    sf1_featured_2.iloc[pos[0]]["datekey"], sf1_featured_2.iloc[pos[0]]["ticker"])

    assert len(errors) == 0
Example #6
        return ps
    else:
        return np.nan


if __name__ == "__main__":
    sf1_art = pd.read_csv("./datasets/testing/sf1_art.csv", parse_dates=["datekey", \
        "calendardate", "reportperiod"], index_col="calendardate")
    sf1_art = sf1_art.sort_values(by="datekey", ascending=True)

    sf1_art["datekey"] = sf1_art.index

    sf1_arq = pd.read_csv("./datasets/testing/sf1_arq.csv", parse_dates=["datekey", \
        "calendardate", "reportperiod"], index_col="calendardate")
    sf1_arq = sf1_arq.sort_values(by="datekey", ascending=True)

    metadata = pd.read_csv("./datasets/sharadar/SHARADAR_TICKERS_METADATA.csv", \
        parse_dates=["firstpricedate"])


    sf1_art_featured = pandas_mp_engine(callback=add_sf1_features, atoms=sf1_art, \
        data={"sf1_arq": sf1_arq, 'metadata': metadata}, molecule_key='sf1_art', split_strategy= 'ticker', \
            num_processes=1, molecules_per_process=1)

    sf1_art_aapl = sf1_art_featured.loc[sf1_art_featured.ticker == "AAPL"]
    sf1_art_ntk = sf1_art_featured.loc[sf1_art_featured.ticker == "NTK"]

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print(sf1_art_aapl)
Example #7
def finalize_dataset(metadata, sep_featured=None, sf1_featured=None, num_processes=6):

    sf1_featured = sf1_featured.drop_duplicates(subset=["ticker", "datekey"], keep="last")

    # 2. Select features from SEP, SF1 etc.
    selected_features = base_cols + selected_sf1_features + selected_industry_sf1_features + selected_sep_features

    dataset = merge_datasets(sep_featured, sf1_featured, selected_features)
    
    # 3. Make all values numeric:
    dataset["age"] = pd.to_timedelta(dataset["age"])
    dataset["age"] = dataset["age"].dt.days # pd.to_numeric(dataset["age"].apply())

    # dataset.to_csv("./datasets/ml_ready_live/dataset_with_nans.csv", index=False)

    """
    merged_length = len(dataset)
    merged_cols = set(dataset.columns)

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print("Nan status after merge")
        print("Dataset length: ", merged_length)
        print(dataset.isnull().sum())
    """ 

    # Drop columns with too many missing values
    columns_to_drop = ["saleinv", "pchsale_pchinvt", "pchsaleinv", "rd", "herf"]
    dataset = dataset.drop(columns_to_drop, axis=1)

    dataset = dataset.replace([np.inf, -np.inf], np.nan)
    
    features = list(set(dataset.columns) - set(labels) - set(base_cols) - set(["industry"]))

    # 4. Calculate mean and var for each feature for each size category for the whole market
    # Size classifications: 1 - Nano < $50m; 2 - Micro < $300m; 3 - Small < $2bn; 4 - Mid < $10bn; 5 - Large < $200bn; 6 - Mega >= $200bn
    
    dataset = dataset.dropna(axis=0, subset=["mve"])

    dataset["size"] = pd.NaT
    dataset["size"].loc[dataset.mve < math.log(50e6)] = "nano"
    dataset["size"].loc[(dataset.mve >= math.log(50e6)) & (dataset.mve < math.log(300e6))] = "micro"
    dataset["size"].loc[(dataset.mve >= math.log(300e6)) & (dataset.mve < math.log(2e9))] = "small"
    dataset["size"].loc[(dataset.mve >= math.log(2e9)) & (dataset.mve < math.log(10e9))] = "mid"
    dataset["size"].loc[(dataset.mve >= math.log(10e9)) & (dataset.mve < math.log(200e9))] = "large"
    dataset["size"].loc[dataset.mve >= math.log(200e9)] = "mega"

    nano_dataset = dataset.loc[dataset["size"] == "nano"]
    micro_dataset = dataset.loc[dataset["size"] == "micro"]
    small_dataset = dataset.loc[dataset["size"] == "small"]
    mid_dataset = dataset.loc[dataset["size"] == "mid"]
    large_dataset = dataset.loc[dataset["size"] == "large"]
    mega_dataset = dataset.loc[dataset["size"] == "mega"]

    print(features)
    size_rvs = {}
    for feature in features:
        size_rvs[feature] = {
            "nano": (nano_dataset[feature].mean(), nano_dataset[feature].std()),
            "micro": (micro_dataset[feature].mean(), micro_dataset[feature].std()),
            "small": (small_dataset[feature].mean(), small_dataset[feature].std()),
            "mid": (mid_dataset[feature].mean(), mid_dataset[feature].std()),
            "large": (large_dataset[feature].mean(), large_dataset[feature].std()),
            "mega": (mega_dataset[feature].mean(), mega_dataset[feature].std()),
        }


    # 5. Fix Nans and drop rows    
    dataset = pandas_mp_engine(
        callback=fix_nans_and_drop_rows, 
        atoms=dataset, 
        data={"metadata": metadata}, 
        molecule_key="dataset",
        split_strategy="industry_new",
        num_processes=num_processes, 
        molecules_per_process=1, 
        features=features, 
        size_rvs=size_rvs
    )
    
    dataset["erp_1m_direction"] = np.sign(dataset["erp_1m"])

    dataset = dataset.loc[dataset.primary_label_tbm != 0]

    """
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print("\n\nNan Status After fixing Nans:")
        print("New dataset length: ", len(dataset))
        print("Percentage dropped: ", ((merged_length - len(dataset))/merged_length) * 100)
        print("Dropped columns: ", merged_cols.difference(set(dataset.columns)))
        print(dataset.isnull().sum())
        print(dataset.describe())
    """

    return dataset
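# The size bucketing inside finalize_dataset above can also be written with pd.cut. A hedged
# equivalent sketch (same log-market-cap breakpoints), shown only as a design alternative;
# assign_size_buckets is an illustrative name, not a project function.
import math
import numpy as np
import pandas as pd

def assign_size_buckets(mve: pd.Series) -> pd.Series:
    # mve is assumed to be the log of market value of equity, as in the code above.
    bins = [-np.inf] + [math.log(x) for x in (50e6, 300e6, 2e9, 10e9, 200e9)] + [np.inf]
    labels = ["nano", "micro", "small", "mid", "large", "mega"]
    return pd.cut(mve, bins=bins, labels=labels, right=False)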
        i8_advertising_intensity = 1 if (
            (art_row_cur["sgna"] / art_row_cur["assetsavg"]) >
            industry_means.at[caldate_cur,
                              "industry_mean_advertising_intensity"]) else 0

        ms = i1_roa_above_avg + i2_cf_roa_above_avg + i3_ncfo_exceeds_netinc + i6_rnd_intensity + i7_capex_indensity + i8_advertising_intensity

        return ms

    else:
        return np.nan


if __name__ == "__main__":
    sf1_art = pd.read_csv("./datasets/testing/sf1_art.csv",
                          index_col="datekey",
                          parse_dates=["datekey", "calendardate"])
    metadata = pd.read_csv("./datasets/sharadar/SHARADAR_TICKERS_METADATA.csv",
                           index_col="ticker",
                           parse_dates=["firstpricedate"])

    sf1_art["datekey"] = pd.to_datetime(sf1_art["datekey"])
    sf1_art["calendardate"] = pd.to_datetime(sf1_art["calendardate"])

    metadata["firstpricedate"] = pd.to_datetime(metadata["firstpricedate"])

    sep = pandas_mp_engine(callback=add_equally_weighted_weekly_market_returns, atoms=sep, data=None, \
        molecule_key='sep', split_strategy= 'date', \
            num_processes=4, molecules_per_process=1)