Example #1
def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the predictions to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation
    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame
    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except KeyError:
        found_pdate = False

    if found_pdate:
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        pf = pf.iloc[pd_indices]
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities
    return preds, probas
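
A minimal usage sketch (not from the library itself): the tag value and the trained model are hypothetical, and Partition is the alphapy partition enum referenced in the signature.

# Hypothetical call: save test-set predictions produced under tag 'xgb_model'.
# Assumes `model` is a fitted alphapy.Model with preds/probas populated.
preds, probas = save_predictions(model, 'xgb_model', Partition.test)
print("predictions saved:", len(preds))
if probas is not None:
    print("probabilities saved:", len(probas))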
Example #2
def main(args=None):
    r"""The main program for SportFlow.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="sport_flow.log", filemode='a', level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--predict', dest='predict_mode', action='store_true')
    group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args(args)

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        train_date = datetime.datetime(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']   

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = args.predict_date
    specs['train_date'] = args.train_date

    # Unpack model arguments

    directory = specs['directory']
    target = specs['target']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space
    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    null_indices = [i for i, val in enumerate(null_rows.tolist()) if val]
    for i in null_indices:
        logger.info("Null Record: %d on Date: %s", i, df.date[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    #
    # Initialize the final frame
    #

    ff = pd.DataFrame()

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, gf.shape[0])
        for index, row in gf.iterrows():
            # compute the derived game variables with direct .at assignments
            # to avoid chained indexing
            point_margin = get_point_margin(row, 'home.score', 'away.score')
            cover_margin = point_margin + row['line']
            overunder_margin = gf.at[index, 'total_points'] - row['over_under']
            gf.at[index, 'point_margin_game'] = point_margin
            gf.at[index, 'won_on_points'] = point_margin > 0
            gf.at[index, 'lost_on_points'] = point_margin < 0
            gf.at[index, 'cover_margin_game'] = cover_margin
            gf.at[index, 'won_on_spread'] = cover_margin > 0
            gf.at[index, 'lost_on_spread'] = cover_margin <= 0
            gf.at[index, 'overunder_margin'] = overunder_margin
            gf.at[index, 'over'] = overunder_margin > 0
            gf.at[index, 'under'] = overunder_margin < 0

        # Generate each team frame

        team_frames = {}
        teams = gf.groupby([home_team])
        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        mdict = {k:v for (k,v) in list(sports_dict.items()) if v != bool}
        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        frames = [gf, team1_frame, team2_frame]
        mf = pd.concat(frames, axis=1)

        # Loop through each team frame, inserting data into the model frame row
        #     get index+1 [if valid]
        #     determine if team is home or away to get prefix
        #     try: np.where((gf[home_team] == 'PHI') & (gf['date'] == '09/07/14'))[0][0]
        #     Assign team frame fields to respective model frame fields: set gf.at(pos, field)

        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                at_home = False
                if team == model_row[home_team]:
                    at_home = True
                    key_team = model_row[home_team]
                elif team == model_row[away_team]:
                    key_team = model_row[away_team]
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)            
                try:
                    if at_home:
                        mpos = np.where((mf[home_team] == key_team) & (mf['date'] == key_date))[0][0]
                    else:
                        mpos = np.where((mf[away_team] == key_team) & (mf['date'] == key_date))[0][0]
                except IndexError:
                    raise IndexError("Team/Date Key not found in Model Frame")
                # print team, gindex, mpos
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index, team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'
        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Append this to final frame
        frames = [ff, mf]
        ff = pd.concat(frames)

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs

    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)
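
A hedged sketch of invoking this entry point programmatically; the flags mirror the parser arguments defined above, and the dates are placeholders only.

# Hypothetical invocation: train on games between the two dates
# (the args list is forwarded to parse_args above).
main(['--train', '--tdate', '2015-09-01', '--pdate', '2016-09-01'])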
Example #3
def trade_system(model, system, space, intraday, name, quantity):
    r"""Trade the given system.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The long/short system to run.
    space : alphapy.Space
        Namespace of instrument prices.
    intraday : bool
        If True, then run an intraday system.
    name : str
        The symbol to trade.
    quantity : float
        The amount of ``name`` to trade, e.g., the number of shares.

    Returns
    -------
    tradelist : list
        List of trade entries and exits.

    Other Parameters
    ----------------
    Frame.frames : dict
        All of the data frames containing price data.

    """

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack the system parameters.

    longentry = system.longentry
    shortentry = system.shortentry
    longexit = system.longexit
    shortexit = system.shortexit
    holdperiod = system.holdperiod
    scale = system.scale

    # Determine whether or not this is a model-driven system.

    entries_and_exits = [longentry, shortentry, longexit, shortexit]
    active_signals = [x for x in entries_and_exits if x is not None]
    use_model = False
    for signal in active_signals:
        if any(x in signal for x in ['phigh', 'plow']):
            use_model = True

    # Read in the price frame
    pf = Frame.frames[frame_name(name, space)].df

    # Use model output probabilities as input to the system

    if use_model:
        # get latest probabilities file
        probs_dir = SSEP.join([directory, 'output'])
        file_path = most_recent_file(probs_dir, 'probabilities*')
        file_name = file_path.split(SSEP)[-1].split('.')[0]
        # read the probabilities frame and trim the price frame
        probs_frame = read_frame(probs_dir, file_name, extension, separator)
        pf = pf[-probs_frame.shape[0]:]
        probs_frame.index = pf.index
        probs_frame.columns = ['probability']
        # add probability column to price frame
        pf = pd.concat([pf, probs_frame], axis=1)

    # Evaluate the long and short events in the price frame

    for signal in active_signals:
        vexec(pf, signal)

    # Initialize trading state variables

    inlong = False
    inshort = False
    h = 0
    p = 0
    q = quantity
    tradelist = []

    # Loop through prices and generate trades

    for dt, row in pf.iterrows():
        # get closing price
        c = row['close']
        if intraday:
            bar_number = row['bar_number']
            end_of_day = row['end_of_day']
        # evaluate entry and exit conditions
        lerow = row[longentry] if longentry else None
        serow = row[shortentry] if shortentry else None
        lxrow = row[longexit] if longexit else None
        sxrow = row[shortexit] if shortexit else None
        # process the long and short events
        if lerow:
            if p < 0:
                # short active, so exit short
                tradelist.append((dt, [name, Orders.sx, -p, c]))
                inshort = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go long (again)
                tradelist.append((dt, [name, Orders.le, q, c]))
                inlong = True
                p = p + q
        elif serow:
            if p > 0:
                # long active, so exit long
                tradelist.append((dt, [name, Orders.lx, -p, c]))
                inlong = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go short (again)
                tradelist.append((dt, [name, Orders.se, -q, c]))
                inshort = True
                p = p - q
        # check exit conditions
        if inlong and h > 0 and lxrow:
            # long active, so exit long
            tradelist.append((dt, [name, Orders.lx, -p, c]))
            inlong = False
            h = 0
            p = 0
        if inshort and h > 0 and sxrow:
            # short active, so exit short
            tradelist.append((dt, [name, Orders.sx, -p, c]))
            inshort = False
            h = 0
            p = 0
        # if a holding period was given, then check for exit
        if holdperiod and h >= holdperiod:
            if inlong:
                tradelist.append((dt, [name, Orders.lh, -p, c]))
                inlong = False
            if inshort:
                tradelist.append((dt, [name, Orders.sh, -p, c]))
                inshort = False
            h = 0
            p = 0
        # increment the hold counter
        if inlong or inshort:
            h += 1
            if intraday and end_of_day:
                if inlong:
                    # long active, so exit long
                    tradelist.append((dt, [name, Orders.lx, -p, c]))
                    inlong = False
                if inshort:
                    # short active, so exit short
                    tradelist.append((dt, [name, Orders.sx, -p, c]))
                    inshort = False
                h = 0
                p = 0
    return tradelist
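
A small sketch of consuming the returned trade list; each entry follows the (timestamp, [symbol, order, quantity, price]) layout appended in the loop above.

# Hypothetical post-processing of the trade list returned by trade_system.
for dt, (symbol, order, shares, price) in tradelist:
    print(dt, symbol, order, shares, price)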
Example #4
def get_data(model, partition):
    r"""Get data for the given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    X : pandas.DataFrame
        The feature set.
    y : pandas.Series
        The array of target values, if available.

    """

    logger.info("Loading Data")

    # Extract the model data

    directory = model.specs['directory']
    extension = model.specs['extension']
    features = model.specs['features']
    model_type = model.specs['model_type']
    separator = model.specs['separator']
    target = model.specs['target']
    test_file = model.test_file
    train_file = model.train_file

    # Read in the file

    filename = datasets[partition]
    input_dir = SSEP.join([directory, 'input'])
    df = read_frame(input_dir, filename, extension, separator)

    # Assign target and drop it if necessary

    y = np.empty([0, 0])
    if target in df.columns:
        logger.info("Found target %s in data frame", target)
        # check if target column has NaN values
        nan_count = df[target].isnull().sum()
        if nan_count > 0:
            logger.info("Found %d records with NaN target values", nan_count)
            logger.info("Labels (y) for %s will not be used", partition)
        else:
            # assign the target column to y
            y = df[target]
            # encode label only for classification
            if model_type == ModelType.classification:
                y = LabelEncoder().fit_transform(y)
            logger.info("Labels (y) found for %s", partition)
        # drop the target from the original frame
        df = df.drop([target], axis=1)
    else:
        logger.info("Target %s not found in %s", target, partition)

    # Extract features

    if features == WILDCARD:
        X = df
    else:
        X = df[features]

    # Labels are returned usually only for training data
    return X, y
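
A hedged usage sketch, assuming a configured alphapy.Model and the Partition enum from alphapy.

# Hypothetical call: load the training partition.
# y is an empty array when no usable target column is present.
X_train, y_train = get_data(model, Partition.train)
print("features:", X_train.shape, "labels:", len(y_train))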
Example #5
def get_market_data(model, group, lookback_period, resample_data):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    resample_data : bool
        If True, then resample the data to the space's fractal.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if any(substring in fractal for substring in PD_INTRADAY_OFFSETS):
        # intraday data (date and time)
        logger.info("Getting Intraday Data [%s] from %s", fractal, schema)
        intraday_data = True
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("Getting Daily Data [%s] from %s", fractal, schema)
        intraday_data = False
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    pandas_data = any(substring in schema for substring in PD_WEB_DATA_FEEDS)
    n_periods = 0

    for item in group.members:
        logger.info("Getting %s data for last %d days", item, lookback_period)
        # Locate the data source
        if schema == 'data':
            fname = frame_name(item.lower(), gspace)
            df = read_frame(data_dir, fname, extension, separator)
            if not intraday_data:
                df.set_index(pd.DatetimeIndex(df[index_column]),
                             drop=True,
                             inplace=True)
        elif schema == 'google' and intraday_data:
            df = get_google_data(item, lookback_period, fractal)
        elif pandas_data:
            df = get_pandas_data(schema, item, lookback_period)
        else:
            df = None
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if df is not None and not df.empty:
            logger.info("Rows: %d", len(df))
            # standardize column names
            df = df.rename(columns=lambda x: x.lower().replace(' ', ''))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # order by increasing date if necessary
            df = df.sort_index()
            # resample data
            if resample_data:
                df = df.resample(fractal).agg({
                    'open': 'first',
                    'high': 'max',
                    'low': 'min',
                    'close': 'last',
                    'volume': 'sum'
                })
                logger.info("Rows after Resampling at %s: %d", fractal,
                            len(df))
            # allocate global Frame
            newf = Frame(item.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s", item)
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", item)

    # The number of periods actually retrieved
    return n_periods
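
The resample/agg mapping above is standard pandas OHLCV downsampling; a self-contained sketch with synthetic minute bars (illustrative data only):

import numpy as np
import pandas as pd

# Ten synthetic 1-minute bars, downsampled to 5-minute bars with the
# same aggregation mapping used in the loop above.
idx = pd.date_range('2020-01-02 09:30', periods=10, freq='1min')
bars = pd.DataFrame({'open': np.arange(10.0),
                     'high': np.arange(10.0) + 0.5,
                     'low': np.arange(10.0) - 0.5,
                     'close': np.arange(10.0) + 0.25,
                     'volume': 100}, index=idx)
bars_5m = bars.resample('5min').agg({'open': 'first',
                                     'high': 'max',
                                     'low': 'min',
                                     'close': 'last',
                                     'volume': 'sum'})
print(bars_5m)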
Example #6
def get_market_data(model, group, lookback_period,
                    data_fractal, intraday_data=False):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    data_fractal : str
        Pandas offset alias.
    intraday_data : bool
        If True, then get intraday data.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if intraday_data:
        # intraday data (date and time)
        logger.info("Getting Intraday Data [%s] from %s", data_fractal, schema)
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("Getting Daily Data [%s] from %s", data_fractal, schema)
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    pandas_data = any(substring in schema for substring in PD_WEB_DATA_FEEDS)
    n_periods = 0
    resample_data = fractal != data_fractal
    df = None
    to_date = pd.to_datetime('today')
    from_date = to_date - pd.to_timedelta(lookback_period, unit='d')

    for item in group.members:
        logger.info("Getting %s data for last %d days", item, lookback_period)
        # Locate the data source
        if schema == 'data':
            # local intraday or daily
            dspace = Space(gspace.subject, gspace.schema, data_fractal)
            fname = frame_name(item.lower(), dspace)
            df = read_frame(data_dir, fname, extension, separator)
        elif schema == 'google' and intraday_data:
            # intraday only
            df = get_google_data(item, lookback_period, data_fractal)
        elif pandas_data:
            # daily only
            df = get_pandas_data(schema, item, lookback_period)
        else:
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if df is not None and not df.empty:
            logger.info("%d data points from %s to %s", len(df), from_date, to_date)
            # convert data to canonical form
            df = convert_data(df, index_column, intraday_data)
            # resample data and forward fill any NA values
            if resample_data:
                df = df.resample(fractal).agg({'open'   : 'first',
                                               'high'   : 'max',
                                               'low'    : 'min',
                                               'close'  : 'last',
                                               'volume' : 'sum'})
                df.dropna(axis=0, how='any', inplace=True)
                logger.info("Rows after Resampling at %s: %d",
                            fractal, len(df))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # allocate global Frame
            newf = Frame(item.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s", item)
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", item)

    # The number of periods actually retrieved
    return n_periods
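
The lookback window above is plain pandas date arithmetic; a standalone sketch with an illustrative 30-day window:

import pandas as pd

# Mirror the from_date/to_date computation above for a 30-day lookback.
to_date = pd.to_datetime('today')
from_date = to_date - pd.to_timedelta(30, unit='d')
print(from_date.strftime('%Y-%m-%d'), '->', to_date.strftime('%Y-%m-%d'))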
Example #7
def get_data(model, partition):
    r"""Get data for the given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    X : pandas.DataFrame
        The feature set.
    y : pandas.Series
        The array of target values, if available.

    Raises
    ------
    ValueError
        Found test labels with NaN values.

    """

    logger.info("Loading Data")

    # Extract the model data

    directory = model.specs['directory']
    extension = model.specs['extension']
    features = model.specs['features']
    model_type = model.specs['model_type']
    separator = model.specs['separator']
    target = model.specs['target']
    test_file = model.test_file
    train_file = model.train_file

    # Read in the file

    filename = datasets[partition]
    input_dir = SSEP.join([directory, 'input'])
    df = read_frame(input_dir, filename, extension, separator)

    # Assign target and drop it if necessary

    y = np.empty([0, 0])
    if target in df.columns:
        logger.info("Found target %s in data frame", target)
        # drop rows with NaN targets
        original_size = df.shape[0]
        df.dropna(axis=0, subset=[target], inplace=True)
        diff = original_size - df.shape[0]
        if diff > 0:
            raise ValueError("Found %d records in %s with NaN target values" %
                             (diff, partition))
        # assign the target column to y
        y = df[target]
        # encode label only for classification
        if model_type == ModelType.classification:
            y = LabelEncoder().fit_transform(y)
        # drop the target as it has already been extracted into y
        logger.info("Dropping target %s from data frame", target)
        df = df.drop([target], axis=1)
    else:
        logger.info("Target %s not found in %s", target, partition)

    # Extract features

    if features == WILDCARD:
        X = df
    else:
        X = df[features]

    # Labels are returned usually only for training data
    return X, y
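
For the classification branch, LabelEncoder maps arbitrary class labels to integer codes; a standalone sklearn sketch with made-up labels:

from sklearn.preprocessing import LabelEncoder

# Encode string class labels to integers, as done for the target above.
labels = ['win', 'loss', 'win', 'draw']
encoded = LabelEncoder().fit_transform(labels)
print(encoded)  # [2 1 2 0] -- codes follow the sorted class order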
Example #8
def get_market_data(model,
                    market_specs,
                    group,
                    lookback_period,
                    intraday_data=False):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    market_specs : dict
        The specifications for controlling the MarketFlow pipeline.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    intraday_data : bool
        If True, then get intraday data.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack market specifications

    data_fractal = market_specs['data_fractal']
    subschema = market_specs['subschema']

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if intraday_data:
        # intraday data (date and time)
        logger.info("%s Intraday Data [%s] for %d periods", schema,
                    data_fractal, lookback_period)
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("%s Daily Data [%s] for %d periods", schema, data_fractal,
                    lookback_period)
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    n_periods = 0
    resample_data = fractal != data_fractal

    # Date Arithmetic

    to_date = pd.to_datetime('today')
    from_date = to_date - pd.to_timedelta(lookback_period, unit='d')
    to_date = to_date.strftime('%Y-%m-%d')
    from_date = from_date.strftime('%Y-%m-%d')

    # Get the data from the specified data feed

    df = pd.DataFrame()
    for symbol in group.members:
        logger.info("Getting %s data from %s to %s", symbol.upper(), from_date,
                    to_date)
        # Locate the data source
        if schema == 'data':
            # local intraday or daily
            dspace = Space(gspace.subject, gspace.schema, data_fractal)
            fname = frame_name(symbol.lower(), dspace)
            df = read_frame(data_dir, fname, extension, separator)
        elif schema in data_dispatch_table.keys():
            df = data_dispatch_table[schema](schema, subschema, symbol,
                                             intraday_data, data_fractal,
                                             from_date, to_date,
                                             lookback_period)
        else:
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if not df.empty:
            logger.info("Rows: %d [%s]", len(df), data_fractal)
            # convert data to canonical form
            df = convert_data(df, index_column, intraday_data)
            # resample data and forward fill any NA values
            if resample_data:
                df = df.resample(fractal).agg({
                    'open': 'first',
                    'high': 'max',
                    'low': 'min',
                    'close': 'last',
                    'volume': 'sum'
                })
                df.dropna(axis=0, how='any', inplace=True)
                logger.info("Rows after Resampling at %s: %d", fractal,
                            len(df))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # allocate global Frame
            newf = Frame(symbol.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s",
                             symbol.upper())
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", symbol.upper())

    # The number of periods actually retrieved
    return n_periods
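
The data_dispatch_table above maps a schema name to a fetch function; a minimal sketch of that dispatch pattern with hypothetical fetchers (not the actual AlphaPy feed functions):

# Hypothetical dispatch table: schema name -> data-fetching callable.
def fetch_local(symbol, **kwargs):
    return "local rows for %s" % symbol

def fetch_web(symbol, **kwargs):
    return "web rows for %s" % symbol

dispatch_table = {'data': fetch_local, 'web': fetch_web}
schema = 'web'
if schema in dispatch_table:
    rows = dispatch_table[schema](symbol='SPY')
else:
    raise ValueError("Unsupported Data Source: %s" % schema)
print(rows)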