Example No. 1
# Imports assumed by this excerpt (legacy statsmodels ARIMA API)
from dplython import X, select, sift
from statsmodels.tsa.arima_model import ARIMA

# ARIMA MODEL #3: STAGE 2, MATCH EVENTS ONLY
# Focus on stage 2 only: it makes more sense to consider match events in the context
# of the match itself, rather than the buildup or the post-match reaction time.
stage_2_df = longform_df >> sift(X.stage_2_ind == 1)
stage_2_df = stage_2_df.reset_index(drop=True)

# New, more thoughtful ARIMA model parameters:
# d = 1 ("first difference"): predict the delta between volumes at consecutive
#       intervals, i.e. "stationarize" the time series.
# q = 1: a series displays moving-average behavior if it apparently undergoes random
#        "shocks" whose effects are felt over two or more consecutive periods.
# TODO: what is the practical difference between q = 1 and q = 2?
#       (a comparison sketch appears at the end of this example)

x_mat = stage_2_df >> select(stage_2_df.home_goal, stage_2_df.away_goal,
                             stage_2_df.home_yellow, stage_2_df.away_yellow,
                             stage_2_df.home_red, stage_2_df.away_red,
                             stage_2_df.competitive_idx)

model = ARIMA(endog=stage_2_df.shorthand_search_vol,
              exog=x_mat,
              dates=stage_2_df.date_time,
              order=(0, 1, 1))

# TODO: try .predict() for a couple of matches at a time (a sketch follows the fit below).
# There may be a way in ARIMA to indicate that there are multiple overlapping
# time series; here they are treated as uncorrelated series.

model_fit = model.fit(disp=0)  # disp=0 suppresses convergence output
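
# A hedged sketch of the ".predict() for a couple of matches" idea noted above,
# assuming the legacy statsmodels.tsa.arima_model API: with d = 1, typ='levels'
# undoes the differencing so predictions are on the original volume scale.
in_sample = model_fit.predict(typ='levels')
print(in_sample[:10])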
with open('model3.txt', 'w') as f:
    # print summary
    print(model_fit.summary(), file=f)
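
# A hedged sketch addressing the q = 1 vs q = 2 TODO above: fit both orders and
# compare AIC (lower is better), still assuming the legacy statsmodels ARIMA API.
for q in (1, 2):
    candidate = ARIMA(endog=stage_2_df.shorthand_search_vol,
                      exog=x_mat,
                      dates=stage_2_df.date_time,
                      order=(0, 1, q)).fit(disp=0)
    print("q =", q, "AIC =", candidate.aic)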
Example No. 2
def test_ReadPandas_dply():
    filepath = 'tests/data/pandas_table.csv'
    samples = (
        ReadPandas(filepath).dply() >> dp.select(dp.X.col1) >> DplyToList())
    nt.assert_equal(samples, [[1], [2], [3]])
# pip installation
pip install dfply
# conda installation
conda install -c tallic dfply

import pandas as pd
from dfply import *

women = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-12-08/women.csv')
df = women

# Chain operations on the data with the >> operator, or use >>= for in-place
# operations (see the sketch after the next line).
# Columns can be specified either by name (string) or by integer position.
lowprice = diamonds >> head(10) >> tail(3)
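
# A minimal sketch of the in-place >>= variant mentioned above (assumes dfply's
# >>= operator, which rebinds/overwrites the left-hand DataFrame):
df_small = diamonds.copy()
df_small >>= head(10)
df_small >>= tail(3)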

diamonds >> select(X.carat, X.cut) >> head(3)
diamonds >> select(1, X.price, ['x', 'y']) >> head(2)
diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)
diamonds >> select(~X.carat, ~X.color, ~X.clarity) >> head(2)  # the ~ (inversion) operator inside select() drops the named columns

# select filters
diamonds >> select(starts_with('c')) >> head(2)
diamonds >> drop(columns_from(X.price)) >> head(2)
# mixing techniques to select the first 2 cols, the 'depth' col and the last 2 cols
diamonds >> select(columns_to(1, inclusive=True), 'depth', columns_from(-2)) >> head(2)
'''
    starts_with(prefix): find columns that start with a string prefix.
    ends_with(suffix): find columns that end with a string suffix.
    contains(substr): find columns that contain a substring in their name.
    everything(): all columns.
    columns_between(start_col, end_col, inclusive=True): find columns between a specified start and end column. The inclusive boolean keyword argument indicates whether the end column should be included or not.
    columns_from(start_col): find all columns from a specified start column onward.
    columns_to(end_col, inclusive=...): find all columns up to a specified end column; inclusive controls whether the end column is included.
'''
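
# A small hedged example of columns_between from the list above (column order in the
# dfply diamonds data is carat, cut, color, clarity, depth, ...):
diamonds >> select(columns_between('cut', 'depth')) >> head(2)
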
# match(variable, colnames(iris))
# colnames(iris)[colnames(iris) %in% variable]
variable='SepalLength'
variable in iris.columns #True
iris.iloc[:,iris.columns==variable]

# library(dplyr)
import dplython as dp
from dplython import (DplyFrame, X, diamonds, select, sift,
                      sample_n, sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)
iris = dp.DplyFrame(iris)
# data(iris)
# data=iris %>% 
# select(Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species)
iris >> dp.select(X.Species) >> dp.head()

iris[['Species', 'PetalLength']]
iris.drop('SepalLength', axis=1)  # drop that column
iris.drop(5, axis=0)  # drop the sixth row
# data=iris %>% 
# filter(Petal.Length>1 & Petal.Length<100)
iris >> dp.sift(X.PetalLength>5)

iris[(iris['PetalLength']>5) & (iris['PetalLength']<6)]
# data=iris %>% 
# dplyr::group_by(Species) %>%
# summarise(media=mean(Petal.Length)) 
iris >> dp.group_by(X.Species) >> dp.summarize(media=X.PetalLength.mean())

iris.groupby(['Species'])['PetalLength'].agg(['mean', 'sum', 'count'])
Example No. 5
bigr = output[output['word'].str.contains("_")]

"""FROM THIS PART, 2 STRATEGIES, SAVE THE OUTPUT AND CONTINUE W R OR GO AHEAD W PYTHON"""




"""5 plotting"""
"""5 1 aggregating for plotting"""
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
    sample_frac, head, arrange, mutate, group_by, summarize, DelayFunction)
dfr = DplyFrame(output)
dfr = (dfr >> 
  group_by(X.word, X.source) >> 
  summarize(tot=X.count.sum()))
dff = (dfr >> select(X.word, X.tot))

"""5.2 wordcloud"""
"""turns the word freq to dict"""
d = {}
for a, x in dff.values:
    d[a] = x
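# Equivalent one-liner: d = dict(zip(dff['word'], dff['tot']))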
wordcloud = WordCloud(width = 1000, height = 1000,
                background_color ='white',
                min_font_size =15, max_font_size=120).generate_from_frequencies(frequencies=d)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
Example No. 6
def MinMaxScaler(data):
    """Min-max normalize each column; the 1e-7 term avoids division by zero."""
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)



# train Parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(pd.read_csv('./bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
#data = MinMaxScaler(data)
# Note: tf.layers.batch_normalization returns a TF tensor rather than a NumPy array;
# the commented-out MinMaxScaler above is the pure-NumPy alternative.
data = tf.layers.batch_normalization(data)
x = data
y = data[:, [0]]  # last as label

# build a dataset
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # Next close price
    print(_x, "->", _y)
    dataX.append(_x)
    dataY.append(_y)
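
# Quick sanity check (a sketch, assuming the NumPy path, e.g. via MinMaxScaler above,
# so that x is an ndarray): each window should have shape (seq_length, data_dim) = (60, 8).
trainX, trainY = np.array(dataX), np.array(dataY)
print(trainX.shape, trainY.shape)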
Example No. 7
def czMatchmaker(data, Q, precursor_fasta):
    # NOTE: the hard-coded path below overrides the `data` argument passed in
    data = pd.read_csv(
        "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)
    precursors = data >> \
     sift( X.tag == 'precursor' ) >> \
     select( X.active, X.neutral, X.estimates)

    fragments = data >> sift( X.tag != 'precursor' ) >> \
     group_by( X.tag, X.active, X.broken_bond ) >> \
     summarize( estimates = X.estimates.sum() )

    I_on_fragments = {}
    optiminfos = {}
    for break_point, data in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(data, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    cations_fragmented_I = sum(
        sum(I_on_fragments[bP][p] for p in I_on_fragments[bP])
        for bP in I_on_fragments)

    I_no_reactions = precursors >> \
        sift( X.active==Q, X.neutral == 0) >> \
        select( X.estimates )

    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = precursors >> \
        sift( X.active != Q ) >> \
        rename( ETnoD  = X.neutral, I = X.estimates ) >> \
        mutate( PTR    = Q - X.ETnoD - X.active ) >> \
        select( X.ETnoD, X.PTR, X.I )

    I_prec_no_frag = prec_ETnoD_PTR_I >> \
        summarize( I = X.I.sum() )

    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    precursorNoReactions = precursors >> \
        sift( X.active == Q ) >> \
        select( X.estimates )

    prec_ETnoD_PTR_I = prec_ETnoD_PTR_I >> mutate(
            I_PTR  = crossprod(X.PTR, X.I), \
            I_ETnoD = crossprod(X.ETnoD, X.I) ) >> \
        summarize( I_PTR = X.I_PTR.sum(), I_ETnoD = X.I_ETnoD.sum() )

    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    I_frags = dict(
        (bP, sum(I_on_fragments[bP][pairing]
                 for pairing in I_on_fragments[bP])) for bP in I_on_fragments)

    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    prob_frag = Counter(
        dict((int(bP), I_frags[bP] / I_frag_total) for bP in I_frags))
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (I_no_reactions + I_frag_total +
                                         I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
Example No. 8
        for i, into_col in enumerate(sp_into):
            df[into_col] = [
                row[i] if len(row) > i else None for row in splitcol
            ]

        columns = list(df.columns)

        # the new sp_into columns were appended at the end of df, so exclude that
        # block from the tail slice when rebuilding the column order
        reorder_columns = columns[:columns.index(sp_col)] + sp_into + columns[
            (columns.index(sp_col) + 1):-len(sp_into)]

        return df[reorder_columns]

    def __rrshift__(self, other):
        # invoked when a plain DataFrame appears on the left of >>; operate on a DplyFrame copy
        return self.__call__(DplyFrame(other.copy(deep=True)))


if __name__ == '__main__':

    mtcars = read_tsv('test/data/mtcars.tsv')
    mtcars = mtcars >> select(X.name, X.mpg, X.cyl)

    d = zip(map(str, mtcars['name']), map(str, mtcars['mpg']),
            map(str, mtcars['cyl']))
    d = ['|'.join(x) for x in d]
    mtcars['name'] = d

    mtcars = mtcars >> select(X.name)
    mtcars_clean = mtcars >> separate(X.name, ['name', 'mpg', 'cyl'], '|')  # split on the '|' used in the join above

    print(mtcars_clean >> head())
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by, summarize,
                      DelayFunction)

diamonds >> head(5)

diamonds >> select(X.carat, X.cut, X.price) >> head(5)

d = (diamonds >> sift(X.carat > 4) >> select(X.carat, X.cut, X.depth, X.price)
     >> head(2))

(diamonds >> mutate(carat_bin=X.carat.round()) >> group_by(X.cut, X.carat_bin)
 >> summarize(avg_price=X.price.mean()))

test = df['deaths'] < 0
less_than_zero = df[test]
print(less_than_zero.shape)
print(less_than_zero.head())

test

#df['deaths_fixed'] = df['deaths_new'].apply(lambda x: 'True' if x <= 0 else 'False')
Example No. 10
def load_data(input_dir, crsrd_id):
    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    cctv_log['DATE'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).date)
    cctv_log['HOUR'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).hour)
    cctv_log['MINUTE'] = (
        pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).minute) // 30) * 30
    cctv_log['temp_DAY'] = pd.to_datetime(cctv_log['DATE']).dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)  #mon - fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  #sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  #sun
    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())
    # Extract records of selected crossroad
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame from existing dataset
    tf = DplyFrame(
        df0.drop_duplicates(
            ['DATE', 'DAY', 'HOUR', 'MINUTE'], keep='last')) >> select(
                X.DATE, X.DAY, X.HOUR, X.MINUTE)

    # Pivot the data: merge each CCTV's GO_TRF / LEFT_TRF as its own column pair
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = tf

    for cctv in cctv_list:
        a = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1,
                       a,
                       how='left',
                       on=['DATE', 'DAY', 'HOUR', 'MINUTE'],
                       suffixes=('', '_' + str(cctv)))

    df1 = df1.set_index(['DATE', 'DAY', 'HOUR', 'MINUTE'])
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    df1['TOTAL_TRF'] = DplyFrame(df1.iloc[:, 4:3 + len(cctv_list) * 2].sum(
        axis=1, skipna=True))
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)
    # Name the cctv id and direction - for tod_traffic_analyzer

    cols = [cctv + '_GO_RATE' for cctv in cctv_list]
    cols.extend([cctv + '_LEFT_RATE' for cctv in cctv_list])
    cols = sorted(cols)
    cols = ['TOD'] + cols + ['TOTAL_TRF']

    return df1, cols
Example No. 11
    for match_id, match_df in large_df_in.groupby(['match_id']):
        x_mat = x_mat_in >> sift(X.match_id == match_id)
        x_mat = x_mat.drop(columns=["match_id"])

        fit_arima_model(df_in=match_df,
                        x_mat_in=x_mat,
                        order_in=(1, 1, 0),
                        coefficients_dict=coefficients,
                        feature_set=feature_set_in,
                        match_id=str(match_id))

        print(match_id)

    json_filename = "arima_" + str(feature_set_in) + ".json"
    export_coefficients(coefficients, json_filename)


if __name__ == "__main__":
    longform_df = DplyFrame(
        pd.read_csv("../../LongForm/longform.csv",
                    dtype={'shorthand_search_vol': float}))

    stage_2_df = process_data(longform_df)
    x_mat = stage_2_df >> select(stage_2_df.match_id, stage_2_df.home_goal,
                                 stage_2_df.away_goal, stage_2_df.home_yellow,
                                 stage_2_df.away_yellow, stage_2_df.home_red,
                                 stage_2_df.away_red)

    # run model with 1st feature set
    run_arima_models(stage_2_df, x_mat, 1)
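
    # A hedged sketch of a second run with a narrower, hypothetical exogenous matrix;
    # per the loop above, feature_set_in tags the output file ("arima_<n>.json") and is
    # forwarded to fit_arima_model.
    x_mat_goals_only = stage_2_df >> select(stage_2_df.match_id, stage_2_df.home_goal,
                                            stage_2_df.away_goal)
    run_arima_models(stage_2_df, x_mat_goals_only, 2)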