def cleaneddf(no_bins=0):
    #you'll want to tweak this to conform with your computer's file system
    testpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\test.csv'
    trainpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\train.csv'
    print(trainpath)
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    
    #discretise fare
    if no_bins==0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf=cleandf(traindf)
    testdf=cleandf(testdf)
    bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True)
    bins=bins_and_binned_fare[1]
    traindf.Fare = bins_and_binned_fare[0]
    testdf.Fare = pd.cut(testdf.Fare, bins)
    
    #discretise age
    bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True)
    bins=bins_and_binned_age[1]
    
    traindf.Age = bins_and_binned_age[0]
    testdf.Age = pd.cut(testdf.Age, bins)
    
    #create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived']=[0 for x in range(len(testdf))]
    predictiondf.to_csv(r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\prediction.csv',
                  index=False)

    return [traindf, testdf]
Example #2
def logsums(name, dir_name):
    # 

    logsum = 'CFULL/SHO'
    logsum_output = 'outputs/grouped/logsums.csv'

    df = pd.read_csv(os.path.join(dir_name, 'aggregate_logsums.1.dat'), delim_whitespace=True, skipinitialspace=True)
    df = df.reset_index()
    df = pd.DataFrame(df[['level_0',logsum]])
    df['source'] = name

    # Separate into accessibility bins
    df['accessibility'] = pd.qcut(df[logsum],5,labels=['lowest','low','moderate','high','highest'])
    bins = pd.qcut(df[logsum],5,retbins=True)[1]

    df.columns = ['taz','logsum','source','accessibility']

    # Attach population
    hh = pd.read_csv(os.path.join(dir_name,'_household.tsv'), sep='\t')
    df_pop = pd.DataFrame(hh.groupby('hhtaz').sum()['hhsize'])
    df_pop['taz'] = df_pop.index
    df = pd.merge(df,df_pop,on='taz',how='left')
    df.columns = ['taz','logsum','source','accessibility','population']

    # Write to file
    if os.path.exists(logsum_output):
        df_current = pd.read_csv(logsum_output)
        df_current.append(df).to_csv(logsum_output, index=False)
    else:
        df.to_csv(logsum_output, index=False)
Example #3
def cleaneddf(no_bins=0):
    #you'll want to tweak this to conform with your computer's file system
    trainpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtrain.csv'
    testpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtest.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    
    #discretise fare
    if no_bins==0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf=cleandf(traindf)
    testdf=cleandf(testdf)
    bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True)
    bins=bins_and_binned_fare[1]
    traindf.Fare = bins_and_binned_fare[0]
    testdf.Fare = pd.cut(testdf.Fare, bins)
    
    #discretise age
    bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True)
    bins=bins_and_binned_age[1]
    
    traindf.Age = bins_and_binned_age[0]
    testdf.Age = pd.cut(testdf.Age, bins)
    
    #create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived']=[0 for x in range(len(testdf))]
    predictiondf.to_csv('C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/prediction.csv',
                  index=False)
    return [traindf, testdf]
Example #4
def slide_14():
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]

    cats = pd.cut(ages, bins)
    print cats

    # use codes instead of labels
    # print cats.labels
    print cats.codes
    # print cats.levels
    # use categories instead of levels
    print cats.categories
    print pd.value_counts(cats)

    print pd.cut(ages, [18, 26, 36, 61, 100], right=False)

    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    print pd.cut(ages, bins, labels=group_names)

    data = np.random.rand(20)
    print data
    print pd.cut(data, 3, precision=2)

    data = np.random.randn(1000)
    cats = pd.qcut(data, 3)
    print cats
    print pd.value_counts(cats)
    print pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
Example #5
def cleaneddf(no_bins=0):
    #you'll want to tweak this to conform with your computer's file system
    trainpath = '../../data/train.csv'
    testpath = '../../data/test.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    
    #discretise fare
    if no_bins==0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf=cleandf(traindf)
    testdf=cleandf(testdf)
    bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True)
    bins=bins_and_binned_fare[1]
    traindf.Fare = bins_and_binned_fare[0]
    testdf.Fare = pd.cut(testdf.Fare, bins)
   
 
    #discretise age
    bins_and_binned_age = pd.qcut(traindf.Age+jitter(traindf.Age), no_bins, retbins=True)
    bins=bins_and_binned_age[1]
    
    traindf.Age = bins_and_binned_age[0]
    testdf.Age = pd.cut(testdf.Age, bins)
    
    #create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived']=[0 for x in range(len(testdf))]
    predictiondf.to_csv('./prediction.csv', index=False)
    return [traindf, testdf]
Example #6
def create_figure():
    xs = df[x.value].values
    ys = df[y.value].values
    x_title = x.value.title()
    y_title = y.value.title()

    kw = dict()
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    kw['title'] = "%s vs %s" % (x_title, y_title)

    p = figure(plot_height=600, plot_width=800, tools='pan,box_zoom,reset', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title

    if x.value in discrete:
        p.xaxis.major_label_orientation = pd.np.pi / 4

    sz = 9
    if size.value != 'None':
        groups = pd.qcut(df[size.value].values, len(SIZES))
        sz = [SIZES[xx] for xx in groups.codes]

    c = "#31AADE"
    if color.value != 'None':
        groups = pd.qcut(df[color.value].values, len(COLORS))
        c = [COLORS[xx] for xx in groups.codes]
    p.circle(x=xs, y=ys, color=c, size=sz, line_color="white", alpha=0.6, hover_color='white', hover_alpha=0.5)

    show(p)
    return p
def preproc_households(store):

    df = store['households']

    df['tenure'] = df.hownrent.map({1: 'own', 2: 'rent'})

    # need to keep track of base year income quartiles for use in the
    # transition model - even caching doesn't work because when you add
    # rows via the transitioning, you automatically clear the cache!
    # this is pretty nasty and unfortunate
    df["base_income_quartile"] = pd.Series(pd.qcut(df.income, 4, labels=False),
                                           index=df.index).add(1)
    df["base_income_octile"] = pd.Series(pd.qcut(df.income, 8, labels=False),
                                         index=df.index).add(1)

    # there are some overrides where we move households around in order
    # to match the city totals - in the future we will resynthesize and this
    # can go away - this csv is generated by scripts/match_city_totals.py
    overrides = pd.read_csv("data/household_building_id_overrides.csv",
                            index_col="household_id").building_id
    df.loc[overrides.index, "building_id"] = overrides.values

    # turns out we need 4 more households
    new_households = df.loc[[1132542, 1306618, 950630, 886585]].reset_index()
    # keep unique index
    new_households.index += pd.Series(df.index).max() + 1
    df = df.append(new_households)

    store['households_preproc'] = df
Example #8
def cleaneddf(no_bins=0):
    trainpath = 'Titanic/train.csv'
    testpath = 'Titanic/test.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)

    #discretise fare
    if no_bins == 0:
       return [cleandf(traindf), cleandf(testdf)]
    traindf = cleandf(traindf)
    testdf = cleandf(testdf)
    bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins = True)
    bins = bins_and_binned_fare[1]
    traindf.Fare = bins_and_binned_fare[0]
    testdf.Fare = pd.cut(testdf.Fare, bins)

    #discretise age
    bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins = True)
    bins = bins_and_binned_age[1]

    traindf.Age = bins_and_binned_age[0]
    testdf.Age = pd.cut(testdf.Age, bins)

    #create a file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived']=[0 for x in range(len(testdf))]
    predictiondf.to_csv('Titanic/prediction.csv', index = False)
    return [traindf, testdf]
Example #9
    def performBinning(self, x):

        # Assign initial value to entropy and best number of bins
        bestEntropy = 1.0
        best = 0

        for i in bins:
            try:
                data2 = [x, self.df['TARGET']]
                data = pd.concat(data2, axis=1)
                try:
                    data['binned'] = pd.qcut(data.ix[:,0], i, labels=False)

                # In case there is no differentiation
                except:
                    data['binned'] = data.ix[:,0]


                bindf = pd.DataFrame(index=range(round(float(data.shape[0])/(i+1))), columns=range(i))
                bindf = bindf.fillna(0)


                entropyList = []

                total = data.shape[0]


                for j in range(i):

                    sumTarget = data[data['binned']==j].ix[:,1].sum()

                    prob = sumTarget /total

                    # Applying entropy function
                    entropyList.append(self.calculateEntropy(prob))

                totEntropy= 0

                # Calculating total Entropy
                for j in entropyList:
                    totEntropy = totEntropy + (j/len(entropyList))

                # Checking if new entropy is lower than the previous one
                if totEntropy < bestEntropy:
                    print(totEntropy)
                    bestEntropy = totEntropy
                    best = i

                    print(best)

                else:
                    break

            except:
                break

        global binned
        binned[list(data.columns.values)[0]] = (pd.qcut(data.ix[:, 0], best, labels=False))
Example #10
def test_qcut_duplicates_bin(kwargs, msg):
    # see gh-7751
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
    else:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)
Example #11
 def CalcBinReturns(self, dfSignalReturns):
     q = pd.qcut(dfSignalReturns[self.alpha_name], self.bin_num)
         # use qcut to get which day belongs to which bin
     tmp = dfSignalReturns.copy()
     tmp['Bin'] = q.values.codes + 1
     topQ = tmp[tmp['Bin'].apply(lambda x: x >=self.alpha_range_lower and x <= self.alpha_range_higher)].copy()
     qFilter = pd.qcut(topQ[self.filter_name], 100)
     topQ['FilterBin'] = qFilter.values.codes + 1        
     group = topQ.groupby('FilterBin')
     QuantileReturns = group.mean()
     QuantileReturns['RetBps'] = QuantileReturns[self.return_name] * 10000
     QuantileReturns['Labels'] = Series(index=xrange(1,self.filter_bin_num+1), data=qFilter.values.categories)
     return QuantileReturns        
Example #12
def households(store, settings):
    # start with households from urbansim_defaults
    df = datasources.households(store, settings)

    # need to keep track of base year income quartiles for use in the
    # transition model - even caching doesn't work because when you add
    # rows via the transitioning, you automatically clear the cache!
    # this is pretty nasty and unfortunate
    df["base_income_quartile"] = pd.Series(pd.qcut(df.income, 4, labels=False),
                                           index=df.index).add(1)
    df["base_income_octile"] = pd.Series(pd.qcut(df.income, 8, labels=False),
                                         index=df.index).add(1)
    return df
Example #13
def processAge():
    global df
    setMissingAges()

    # center the mean and scale to unit variance
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_scaled'] = scaler.fit_transform(df['Age'])

    # have a feature for children
    df['isChild'] = np.where(df.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df['Age_bin'] = pd.qcut(df['Age'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Age_bin']).rename(columns=lambda x: 'Age_' + str(x))], axis=1)

    if keep_bins:
        df['Age_bin_id'] = pd.factorize(df['Age_bin'])[0]+1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_bin_id_scaled'] = scaler.fit_transform(df['Age_bin_id'])

    if not keep_strings:
        df.drop('Age_bin', axis=1, inplace=True)
Example #14
def bokeh_choropleth(df):
    """stolen more or less directly from
    http://bokeh.pydata.org/en/0.11.1/docs/gallery/choropleth.html

    """
    states = bksu.data

    # map looks sooooo bad with these included
    for stkey in ['HI', 'AK', 'DC']:
        try:
            del states[stkey]
        except KeyError:
            pass

    state_xs = [d['lons'] for (code, d) in states.items()]
    state_ys = [d['lats'] for (code, d) in states.items()]
    colors = bkpal.Greens9
    colors.reverse()

    state_colors = []
    normcron = (df.corn - df.corn.min()) / df.corn.max()
    stateind = pd.qcut(df.corn, 6).cat.codes
    state_colors = [
        colors[stateind[df.code == statecode].iloc[0]]
        for (statecode, d) in states.items()
    ]
    p = bkp.figure(title='cron', toolbar_location='left', tools=BOKEH_TOOLS)
    p.patches(
        state_xs, state_ys, fill_color=state_colors, fill_alpha=0.7,
        line_color="#884444", line_width=2, line_alpha=0.3
    )

    return bke.components(p)
Example #15
    def cutData(self,var,bins):
        """连续变量离散化1:等数切分数据"""
        q_var = "q_"+var
        
        plot_data = self._data.loc[:,[var]].copy()
        if (len(plot_data[var].value_counts())>20) & (var not in ['addr_state']):  # when there are more than 20 distinct values, bin the data

            bin_acc = bins
            while((q_var in plot_data.columns.tolist())==False):
                try:
                    plot_data[q_var] = pd.qcut(plot_data[var],bin_acc)
                except:
                    #print("can't cut into %s groups" %bin_acc)
                    if bin_acc > 1:
                        bin_acc = bin_acc -1
                        continue
                    else: break


            if(bin_acc==1):
                print("can't cut return uncutted data")
                bins = 1
                plot_data[q_var] = plot_data[var].copy()
            else:
                print("we have cut into %s groups" %bin_acc)
#                print(plot_data[q_var].value_counts())


        else:
            #print("catalog number is lower than 20, we do not re-organize data")
            plot_data[q_var] = plot_data[var].copy()

        return plot_data
Example #16
def processFare():
    global df

    # replace missing values with the median fare. Currently the datasets only contain one missing Fare value
    df['Fare'][ np.isnan(df['Fare']) ] = df['Fare'].median()

    # zero values cause problems with our division interaction variables so set to 1/10th of the lowest fare
    df['Fare'][ np.where(df['Fare']==0)[0] ] = df['Fare'][ df['Fare'].nonzero()[0] ].min() / 10

    # bin into quartiles for binary features
    df['Fare_bin'] = pd.qcut(df['Fare'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))], axis=1)

    if keep_bins:
        df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0]+1

    # center and scale the fare to use as a continuous variable
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_scaled'] = scaler.fit_transform(df['Fare'])

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_bin_id_scaled'] = scaler.fit_transform(df['Fare_bin_id'])


    if not keep_strings:
        df.drop('Fare_bin', axis=1, inplace=True)
def plot_sites_by_characteristic(dataframe, lat_col, long_col, title=None, char_column=None, bins=None, dataframe2=None, lat_col2=None, long_col2=None):
    map = Basemap(projection='merc',llcrnrlat=23.5,urcrnrlat=57, llcrnrlon=-140,urcrnrlon=-50,lat_ts=20,resolution='l')
    map.drawcoastlines(linewidth = 1.25)
    plt.title(title)
    
    if not char_column:    
        lats = dataframe[lat_col]
        longs = dataframe[long_col]
        x,y = map(longs.values,lats.values)
        map.plot(x, y, ls='', marker='o', markersize=4)

    if char_column:
        blues = sns.color_palette("Blues", n_colors=bins)
        dataframe['quantile'] = pd.qcut(dataframe[char_column], bins)
        grouped = dataframe.groupby('quantile')
        
        i= -1
        for groupname, groupdata, in grouped:
            i = i + 1
            colors = blues[i]
            lats = groupdata["lat"]
            longs = groupdata["long"]
            x,y = map(longs.values,lats.values)
            map.plot(x, y, ls='', marker='o', color=colors, markersize=4)
    plt.hold(True)
    if lat_col2:    
        lats = dataframe2[lat_col2]
        longs = dataframe2[long_col2]
        x,y = map(longs.values,lats.values)
        map.plot(x, y, ls='', marker='o', markersize=4, color='brown')    
Example #18
def discretize_data(path, data):
    data_aux = [x[13] for x in data]
    data_discrete = pd.qcut(data_aux, 3, labels=False)
    for i, item in enumerate(data):
        data[i][13] = data_discrete[i]
        # print item
    return data
def get_quantiles_summary(cds_cai_dat,num_of_quantiles,R20_vec_compare,vec_cost):
    # we can use this 'qcut' function from pandas to divide our proteins by the quantiles ...
    category,bins = pd.qcut(cds_cai_dat['CAI'],q=num_of_quantiles,retbins=True,labels=False)
    # then we could iterate over proteins/cDNAs in these categories ...
    fivywrel_cat, r20_cat, cost_cat = [],[],[]
    for cat in range(num_of_quantiles):
        cds_cai_category = cds_cai_dat[category==cat]
        protein_length_distro = cds_cai_category['protein'].str.len()
        # average protein length per quantile as a stability measure ...
        average_length = protein_length_distro.mean()
        # total proteins length in quantile for AA freqs calculations ...
        total_length = protein_length_distro.sum()
        IVYWREL = sum(cds_cai_category['protein'].str.count(aa).sum() for aa in list('IVYWREL'))
        # IVYWREL = cds_cai_category['protein'].str.count('|'.join("IVYWREL")).sum() # tiny bit slower ...
        f_IVYWREL = float(IVYWREL)/float(total_length)
        # 20-vector for of amino acid composition ...
        aa_freq_20 = np.true_divide([cds_cai_category['protein'].str.count(aa).sum() for aa in aacids],float(total_length))
        # slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        _1,_2,R20,_4,_5 = stats.linregress(aa_freq_20, R20_vec_compare)
        # Akashi ...
        cost = np.dot(aa_freq_20,vec_cost)
        # storing info ...
        fivywrel_cat.append(f_IVYWREL)
        r20_cat.append(R20)
        cost_cat.append(cost)
    #returning ...
    return (fivywrel_cat,r20_cat,cost_cat)
def run():
    num_average_ticks = 12  # v=['B', 'H', 'S'] p=[0.05, 0.9, 0.05]
    d = pd.DataFrame(DATA[['timestamp', 'last']])
    d['returns'] = compute_returns(d['last'])
    print(d['returns'].head())

    print(d['returns'].rolling(window=2, center=False).mean().head())

    print(d['returns'])

    sr_column = 'sharpe_ratio_{}'.format(num_average_ticks)
    # reverse the series so the rolling apply looks forward rather than backward, as is usually done
    d[sr_column] = pd.rolling_apply(d['returns'][::-1],
                                    window=num_average_ticks,
                                    func=sharpe_ratio,
                                    center=False).fillna(0)[::-1]

    print(d.tail(100))

    labels = ['SELL', 'HOLD', 'BUY']
    d['signals'] = pd.qcut(d[sr_column], q=[0, 0.05, 0.95, 1], labels=[0, 1, 2])

    print(d.head(100))
    print(d['signals'].head(100))
    d['signals'].astype(np.float).plot()
    import matplotlib.pyplot as plt
    plt.show()
Example #21
 def test_qcut_nat(self, s):
     # GH 19768
     intervals = IntervalIndex.from_tuples(
         [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
     expected = Series(Categorical(intervals, ordered=True))
     result = qcut(s, 2)
     tm.assert_series_equal(result, expected)
def historical():
    with open(r"capacityFactor.csv", "wb") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ',
                             quotechar=' ', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(printG)
        for element in content:

            ent = os.listdir("../Data/Production/%s"%element)
            for en in ent:
                print en
                try:
                    data = pd.read_csv("../Data/Production/%s/%s"%(element,en),index_col=0)
                    dato = str(data.columns.values[0])
                    data[data == 0] = None
                    qs, bins = pd.qcut(data,[.25, .5, .75], retbins=True)
                    print bins[0], bins[1],bins[2]
                    dfList = data[dato].tolist()
                    dato0 = min(dfList, key=lambda x:abs(x-bins[0]))
                    dato1 = min(dfList, key=lambda x:abs(x-bins[1]))
                    dato2 = min(dfList, key=lambda x:abs(x-bins[2]))
                    dato0 = data[data[dato] == dato0].index.tolist()
                    dato1 = data[data[dato] == dato1].index.tolist()
                    dato2 = data[data[dato] == dato2].index.tolist()
                    print dato0, dato1, dato2
                    #print pd.Series(bins, index=['Production_25', 'Production_50', 'Production_75'])
                    row = str(en[:len(en)-4])+","+str(bins[0])+","+str(bins[1])+","+str(bins[2])+","+str(dato0[0][:4])+","+str(dato1[0][:4])+","+str(dato2[0][:4])
                    spamwriter.writerow([row])
                except (ValueError, IndexError):
                    pass
Example #23
def show_orders_hist(order_pd, s_list=None, q_default=10):

    if s_list is None:
        s_list = ['lowBkCnt', 'atr_std', 'jump_power', 'diff_days',
                  'wave_score1', 'wave_score2', 'wave_score3',
                  'deg_60WindowPd', 'deg_hisWindowPd', 'deg_windowPd']

    s_list = filter(lambda x: order_pd.columns.tolist().count(x) > 0, s_list)
    for sn in s_list:
        uq = len(np.unique(order_pd[sn]))
        if uq == 1:
            continue

        bins = 10
        bins = uq // 50 if uq // 50 > bins else bins
        order_pd[sn].hist(bins=bins)
        plt.show()

        try:
            cats = pd.qcut(order_pd[sn], q_default)
        except Exception:
            '''
                a single value repeats more often than q allows, so qcut cannot split the data
            '''
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(order_pd[sn]), np.linspace(0, 1, q_default + 1))
            cats = pd.tools.tile._bins_to_cuts(order_pd[sn], bins, include_lowest=True)
            # ZLog.info(sn + ' qcut except use bins!')
        ZLog.info('{0} show hist and qcuts'.format(sn))
        ZLog.info(cats.value_counts())
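# Note: the fallback above relies on pandas internals (pandas.core.algorithms.quantile and
# pd.tools.tile._bins_to_cuts) that were removed in later pandas releases. A minimal sketch of
# an equivalent fallback on a modern pandas, assuming the only failure mode is duplicate bin
# edges, could look like this:
import pandas as pd

def qcut_with_fallback(series, q=10):
    # try an ordinary equal-frequency cut first
    try:
        return pd.qcut(series, q)
    except ValueError:
        # duplicate quantile edges: let pandas drop them instead of raising
        return pd.qcut(series, q, duplicates='drop')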
def preprocess_damage_types(data, include_qcut_features):
    """Add damage type features for each quartile. Useful for logistic regression."""
    if include_qcut_features:
        for col in [c for c in data.columns if "Damage" in c]:
            data[col + "_qcut5"] = pandas.qcut(data[col], 5)

    return data
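# Since the docstring above points at logistic regression, a natural next step is to one-hot
# encode the interval-valued *_qcut5 columns before fitting. A hedged sketch only; the target
# column name 'y' is hypothetical and not part of the original snippet.
import pandas
from sklearn.linear_model import LogisticRegression

def fit_on_binned_damage(data, target_col='y'):
    # expand each qcut interval column into indicator (dummy) columns
    qcut_cols = [c for c in data.columns if c.endswith('_qcut5')]
    X = pandas.get_dummies(data[qcut_cols])
    return LogisticRegression(max_iter=1000).fit(X, data[target_col])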
Example #25
    def calculatePowerCurveSensitivity(self, dataFrame, power_curve, dataColumn, power_column):
        
        dataFrame['Energy MWh'] = (dataFrame[power_column] * (float(self.timeStepInSeconds) / 3600.)).astype('float')
        
        from collections import OrderedDict
        self.sensitivityLabels = OrderedDict([("V Low","#0000ff"), ("Low","#4400bb"), ("Medium","#880088"), ("High","#bb0044"), ("V High","#ff0000")]) #categories to split data into using data_column and colour to plot
        cutOffForCategories = list(np.arange(0.,1.,1./len(self.sensitivityLabels.keys()))) + [1.]
        
        minCount = len(self.sensitivityLabels.keys()) * 4 #at least 4 data points for each category for a ws bin to be valid
        
        wsBinnedCount = dataFrame[['Wind Speed Bin', dataColumn]].groupby('Wind Speed Bin').count()
        validWsBins = wsBinnedCount.index[wsBinnedCount[dataColumn] > minCount] #ws bins that have enough data for the sensitivity analysis

        dataFrame['Bin'] = np.nan #pre-allocating
        
        for wsBin in dataFrame['Wind Speed Bin'].unique(): #within each wind speed bin, bin again by the categorising by sensCol
            if wsBin in validWsBins:
                try:
                    filt = dataFrame['Wind Speed Bin'] == wsBin
                    dataFrame.loc[filt,'Bin'] = pd.qcut(dataFrame[dataColumn][filt], cutOffForCategories, labels = self.sensitivityLabels.keys())
                except:
                    print "\tCould not categorise data by %s for WS bin %s." % (dataColumn, wsBin)
        
        sensitivityResults = dataFrame[[power_column, 'Energy MWh', 'Wind Speed Bin','Bin']].groupby(['Wind Speed Bin','Bin']).agg({power_column: np.mean, 'Energy MWh': np.sum, 'Wind Speed Bin': len})
        sensitivityResults['Energy Delta MWh'], sensitivityResults['Power Delta kW'] = np.nan, np.nan #pre-allocate
        for i in sensitivityResults.index:
            sensitivityResults.loc[i, 'Power Delta kW'] = sensitivityResults.loc[i, power_column] - power_curve.powerCurveLevels.loc[i[0], power_column]
            sensitivityResults.loc[i, 'Energy Delta MWh'] = sensitivityResults.loc[i, 'Power Delta kW'] * power_curve.powerCurveLevels.loc[i[0], 'Data Count'] * (float(self.timeStepInSeconds) / 3600.)
        
        return sensitivityResults.rename(columns = {'Wind Speed Bin':'Data Count'}), np.abs(sensitivityResults['Energy Delta MWh']).sum() / (power_curve.powerCurveLevels[power_column] * power_curve.powerCurveLevels['Data Count'] * (float(self.timeStepInSeconds) / 3600.)).sum()
def calculateHLStat(obsOutcome, predOutcomeProb):
    
    # Break predicted outcome probabilities into deciles
    predDeciles = pd.qcut(predOutcomeProb, np.arange(0, 1.1, 0.1))
    
    # Pre-allocate
    onesArray = np.nan * np.ones((10,3))
    zerosArray = np.nan * np.zeros((10,3))
    
    # Loop through deciles
    for group in range(10):
        # Observation Counts
        onesCnt = np.sum(obsOutcome[predDeciles.labels == group])
        onesArray[group, 0] = onesCnt
        zerosCnt = np.sum(predDeciles.labels == group) - onesCnt
        zerosArray[group, 0] = zerosCnt
        # Predicted Probabilities
        onesProb = np.sum(predOutcomeProb[predDeciles.labels == group])
        onesArray[group, 1] = onesProb
        zerosProb = np.sum(predDeciles.labels == group) - onesProb
        zerosArray[group, 1] = zerosProb
        # Chi-Squared 
        onesChiSquare = (onesCnt - onesProb) ** 2 / onesProb
        onesArray[group, 2] = onesChiSquare
        zerosChiSquare = (zerosCnt - zerosProb) ** 2 / zerosProb
        zerosArray[group, 2] = zerosChiSquare

    # Chi-Squared Sum and probability
    chiSquareSum = np.sum(onesArray[:, 2]) + np.sum(zerosArray[:, 2])
    chiSquaredof = 8 # dof = g - 2
    chiSquareProb = sm.stats.stattools.stats.chisqprob(chiSquareSum, chiSquaredof)
    
    return chiSquareSum, chiSquareProb  
Example #27
def bin_residuals(resid, var, bins):
    '''
    Compute average residuals within bins of a variable.
    
    Returns a dataframe indexed by the bins, with the bin midpoint,
    the residual average within the bin, and the confidence interval 
    bounds.
    
    ins 
    -- 
    resid, var, bins

    out
    --
    bin DataFrame

    '''
    from pandas import DataFrame, qcut
    import numpy as np
    # bin the variable with pandas qcut and average residuals within each bin
    resid_df = DataFrame({'var': var, 'resid': resid})
    resid_df['bins'] = qcut(var, bins)
    bin_group = resid_df.groupby('bins')
    bin_df = bin_group['var', 'resid'].mean()
    bin_df['count'] = bin_group['resid'].count()
    bin_df['lower_ci'] = -2 * (bin_group['resid'].std() / 
                               np.sqrt(bin_group['resid'].count()))
    bin_df['upper_ci'] =  2 * (bin_group['resid'].std() / 
                               np.sqrt(bin_df['count']))
    bin_df = bin_df.sort('var')
    return(bin_df)
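# A hedged usage sketch for bin_residuals: the data below are synthetic, and the call assumes
# the older pandas API the function itself uses (DataFrame.sort, tuple column selection on a
# GroupBy object).
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2 * x + rng.normal(size=500)
resid = y - 2 * x                     # stand-in residuals from the known model

# average residuals within 10 equal-frequency bins of x
print(bin_residuals(resid, x, 10))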
Example #28
def discretize(df, columnIndex, cutMode, numberOfBins):
	"""Performs in-place discretization on a numeric column

	The function has two modes of operation: discretization and quantiling, using the :func:`pandas.cut`
	and :func:`pandas.qcut` functions respectively. 

	Args:
		df (pandas.DataFrame): data frame
		columnIndex (int): index of column to discretize
		cutMode (str): 'quantiling' or 'discretization'
		numberOfBins (int): arg passed directly into pandas.cut() and pandas.qcut() functions
	"""

	if (cutMode == "discretization"):
		if type(numberOfBins) is not int:
			numberOfBins = numberOfBins.split(',')
			numberOfBins = map(float, numberOfBins)
		df[df.columns[columnIndex]] = pd.cut(df[df.columns[columnIndex]], numberOfBins).astype(str)
	elif (cutMode == "quantiling"):
		if type(numberOfBins) is not int:
			numberOfBins = numberOfBins.split(',')
			numberOfBins = map(float, numberOfBins)
		df[df.columns[columnIndex]] = pd.qcut(df[df.columns[columnIndex]], numberOfBins).astype(str)
	else:
		return False

	# Replace 'nan' strings with np.nan
	df[df.columns[columnIndex]].replace(to_replace="nan", value=np.nan, inplace=True)
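# A hedged usage sketch for discretize, exercising both modes described in the docstring;
# the toy DataFrames are illustrative only.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'age': [23.0, 35.0, 47.0, 52.0, 61.0, 18.0, 29.0, 44.0]})
discretize(toy, columnIndex=0, cutMode='quantiling', numberOfBins=4)       # quartiles via pd.qcut
print(toy['age'].unique())                                                 # four interval strings

toy2 = pd.DataFrame({'age': [23.0, 35.0, 47.0, 52.0, 61.0, 18.0, 29.0, 44.0]})
discretize(toy2, columnIndex=0, cutMode='discretization', numberOfBins=3)  # equal-width bins via pd.cut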
Example #29
def response_deciles (result_tbl):
    
    #requires:
    import pandas as pd
    import numpy as np

    #calculate gains / lift
    bins=10
    
    result=pd.DataFrame(result_tbl,columns=['actual','pred'])
    result['decile']=(bins)-(pd.qcut(result.pred,bins,labels=False))
    grp_dec=result.groupby('decile')
    mean_act_pred=grp_dec['actual','pred'].mean()
    tbl_gains=grp_dec['actual','pred'].agg(['count','sum', 'mean', 'min', 'max']).sort_values([('pred', 'mean')], ascending=False)
    
    l=pd.DataFrame(tbl_gains)
    l_actual=l['actual'].copy().reset_index()
    l_actual=l_actual.drop(['min','max'],axis=1)
    l_actual=l_actual.rename(columns={"mean": "Actual Response Rate","count": "Count","sum": "Responders","decile":"Decile"})
    l_pred=l['pred'].copy().reset_index()
    l_pred=l_pred.drop(['decile','count','sum'],axis=1)
    l_pred=l_pred.rename(columns={"mean": "Predicted Response Rate", "min":"Min Predicted", "max":"Max Predicted"})
    l_comb=pd.concat([l_actual,l_pred],axis=1)

    
    return(l_comb["Actual Response Rate"].values)
Example #30
def test_qcut_include_lowest():
    values = np.arange(10)
    ii = qcut(values, 4)

    ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5),
                               Interval(4.5, 6.75), Interval(6.75, 9)])
    tm.assert_index_equal(ii.categories, ex_levels)
                         cmap=matplotlib.colors.ListedColormap(
                             ((0.4, 0.4, 0.4, 0), (0.4, 0.4, 0.4, 1))),
                         vmin=0,
                         vmax=1,
                         alpha=1.0,
                         zorder=20)

# Plot the T2M
t2m_pc = plot_cube(0.05, -180, 180, -90, 90)
t2m = t2m.regrid(t2m_pc, iris.analysis.Linear())
t2m = quantile_normalise_t2m(t2m)
# Adjust to show the wind
wscale = 200
s = wind_noise_field.data.shape
wind_noise_field.data = qcut(
    wind_noise_field.data.flatten(), wscale, labels=False,
    duplicates='drop').reshape(s) - (wscale - 1) / 2

# Plot as a colour map
wnf = wind_noise_field.regrid(t2m, iris.analysis.Linear())
t2m_img = ax.pcolorfast(lons,
                        lats,
                        t2m.data * 1000 + wnf.data,
                        cmap='RdYlBu_r',
                        alpha=0.8,
                        vmin=-100,
                        vmax=1100,
                        zorder=100)

# PRMSL contours
prmsl_pc = plot_cube(0.25, -180, 180, -90, 90)
Example #32
train['GarageFinish'].isnull().sum()
train['GarageYrBlt'].isnull().sum()
train['GarageQual'].isnull().sum()
train['GarageType'] = train['GarageType'].fillna('NG')
train['GarageCond'] = train['GarageCond'].fillna('NG')
train['GarageFinish'] = train['GarageFinish'].fillna('NG')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna('NG')
train['GarageQual'] = train['GarageQual'].fillna('NG')
train['BsmtExposure'] = train['BsmtExposure'].fillna('NB')
train['BsmtFinType2'] = train['BsmtFinType2'].fillna('NB')
train['BsmtFinType1'] = train['BsmtFinType1'].fillna('NB')
train['BsmtCond'] = train['BsmtCond'].fillna('NB')
train['BsmtQual'] = train['BsmtQual'].fillna('NB')
train['MasVnrType'] = train['MasVnrType'].fillna('none')
train.Electrical = train.Electrical.fillna('SBrkr')
train["LotAreaCut"] = pd.qcut(train.LotArea, 10)
train['LotFrontage'] = train.groupby(
    ['LotAreaCut',
     'Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
train['LotFrontage'] = train.groupby(
    ['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
train.drop("LotAreaCut", axis=1, inplace=True)

#all_columns = train.columns.values
#non_categorical = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
#                   "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
#                   "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea",
#                   "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
#                   "ScreenPorch","PoolArea", "MiscVal"]
#categorical = [value for value in all_columns if value not in non_categorical]
    data['Embarked'] = data['Embarked'].map({
        'S': 0,
        'C': 1,
        'Q': 2
    }).astype(int)

traindf.head()

# In[ ]:

testdf['Fare'].fillna(testdf['Fare'].dropna().median(), inplace=True)
testdf.head()

# In[ ]:

traindf['FareBand'] = pd.qcut(traindf['Fare'], 4)
traindf[['FareBand', 'Survived'
         ]].groupby(['FareBand'],
                    as_index=False).mean().sort_values(by='FareBand',
                                                       ascending=True)

# In[ ]:

for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),
                'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),
                'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
Example #34
def perform_operations(df, col_name, operations):
    """
    Execute operations on a certain column in the dataframe.
        Dtypes                  Operations:      Description:
        ALL                     drop             drop the entire column

        Numerics
                                log              perform log transformation on the column
                                box cox          perform box cox transformation on the column
                                drop0            drop all values with zeros in it
                                absneg           absolute value the negatives
                                median0          replace 0 with the median
                                binning_NUM      bin the column into quantiles using NUM quantile points
                                outlierZ_NUM     remove outliers with z score > NUM
                                shiftmin         subtract the columns by the minimum value

        Datetime
                                finddays         convert datetime to days since the first day

        Categorical/Object
                                cbinning_NUM     create bins where each bin must have occurrences
                                                 of NUM or higher
                                mostcommon       replace nan with most common category

    :param df: dataframe 
    :type  df: pandas.core.frame.DataFrame
    :param col_name: name of column
    :type  col_name: str
    :param operations: list of operations to perform on the certain column
    :type  operations: list
    :returns: transformed dataframe 
    :rtype:   pandas.core.frame.DataFrame
    """

    col = df[col_name]

    # iterate throughout the list of transformations for each column
    for operation in operations:
        if operation == 'drop':
            # immediately returns the dataframe since no more operations can be
            # performed on a drop column
            return df.drop(col_name, axis=1)

        # numeric columns
        elif str(col.dtype) in {'int8', 'int16', 'int32', 'float64'}:
            if operation == 'log':
                col = np.log(1 + col)  # add 1 to avoid log(0)
            elif operation == 'box cox':
                col = ss.boxcox(col + 0.001)  # shift values to be strictly positive for box-cox
            elif operation == "drop0":
                df = df[col != 0]
                col = col[col != 0]
            elif operation == "absneg":
                col = col.abs()
            elif operation == "median0":
                from sklearn.preprocessing import Imputer
                col[col == 0] = np.nan
                imputer = Imputer(strategy="median")
                col = imputer.fit_transform(col.values.reshape(-1, 1))
            elif operation.split('_')[0] == 'binning':
                # name would be binning_NUM
                num = int(operation.split('_')[1])
                quantile_list = [i / (num - 1) for i in range(num)]
                # this column with DROP_ will eventually be dropped.
                # It is kept here in case one wants to reference the bins
                df["DROP_" + col_name] = pd.qcut(
                    col,
                    q=quantile_list,
                    duplicates='raise',
                )
                col = pd.qcut(
                    col,
                    q=quantile_list,
                    duplicates='raise',
                    labels=quantile_list[1:]
                )
            elif operation.split('_')[0] == 'outlierZ':
                z = np.abs(ss.zscore(col))
                keep_values = z < float(operation.split('_')[1])
                df = df[keep_values]
                col = col[keep_values]
            elif operation == "shiftmin":
                col = col - col.min()
            else:
                raise ValueError('Not an available operation for numerics')

        # datetime columns
        elif str(col.dtype) in {'datetime64[ns]'}:
            # TODO: add more datetime dtypes (not sure if that is the only one)
            if operation == "finddays":
                # TODO: should NOT be min, will not generalize to validation/test
                col = (col - min(col)).dt.days

        # categorical or object columns
        elif str(col.dtype) in {'category', 'object'}:
            if operation.split('_')[0] == "cbinning":
                num = float(operation.split('_')[1])
                value_counts = col.value_counts()
                x = col.replace(value_counts)
                df[col_name][df[col_name] == '0'] = np.nan
                df[col_name] = df[col_name].cat.add_categories(['OTHER'])
                df[col_name] = df[col_name].fillna('OTHER')
                df.loc[x < num, col_name] = 'OTHER'
                return df
            elif operation == "mostcommon":
                most_common = col.value_counts().index[0]
                col = col.fillna(most_common)
            else:
                raise ValueError(
                    'Not an available operation for categoricals or objects')

        else:
            raise ValueError('Not an available data type')

    df[col_name] = col

    return df
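# A hedged usage sketch for perform_operations; the column name and the operation list below
# are illustrative, picked from the operation names documented in the docstring above.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'income': np.random.default_rng(1).lognormal(10.0, 1.0, size=200)})
# log-transform the column, then replace it with quantile bins (binning_5)
toy = perform_operations(toy, 'income', ['log', 'binning_5'])
print(toy.head())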
Example #35
        ax1.set_title("Box plot for all the values", fontsize=20)
        plt.setp(ax1.get_xticklabels(), ha="right", rotation=35)
        plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)
        ax1.boxplot(no_null_col)

        ax1 = fig3.add_subplot(2,3,2)
        ax1.set_title("Distribution of all values", fontsize=20)
        plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15)
        plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)
        ax1.hist(no_null_col)

        ax1 = fig3.add_subplot(2,3,3)
        ax1.set_title("Boxplot for quartiles (all values)", fontsize=20)
        if len(no_null_col.value_counts()) >= 4:
            data[u'quartiles'] = pd.qcut(
                            data[col_name],
                            4, duplicates='drop')
            data.boxplot(column= col_name, by=u'quartiles', ax = ax1)
        plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15)
        plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)

        ax1 = fig3.add_subplot(2,3,4)
        ax1.set_title("Box plot without outliers", fontsize=20)
        plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15)
        plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)
        ax1.boxplot(no_null_col, showfliers=False)

        ax1 = fig3.add_subplot(2,3,5)
        ax1.set_title("Violin plot (<95% percentile)", fontsize=20)
        plt.setp(ax1.get_xticklabels(), ha="right", rotation=35, fontsize=15)
        plt.setp(ax1.get_yticklabels(), ha="right", fontsize=15)
Example #36
def automate_Raking(Data):
    """
    Takes a data file with columns [Company Name, Para1, Para2, ..., ParaN, Shareprice_Appriciation].
    Builds every combination of the parameter columns and creates a DataFrame for each combination.
    ## Checks the parameters for multicollinearity (threshold 0.75)
    ## Ranks on each parameter            ## column name == Parametername + _Rank
    ## Averages the parameter ranks       ## column name == Avg_Weightage_Rank
    ## Ranks on Avg_Weightage_Rank        ## column name == Weightages_Avarage_Rank

    ## sort_quartiles then returns the DataFrame grouped by quartiles.

    Returns :
    my_dfs ==> all combination DataFrames that pass the multicollinearity check
    Sorted_dfs ==> DataFrames grouped by quartiles
    reductions_Dfs ==> DataFrames rejected by the multicollinearity check
    
    """

    Df = Data
    Df = Df.fillna(0)
    #     Df = Df[Df.iloc[:,-1].replace({0:-1})]
    Df = Df.copy()
    df_list = list(Df.columns)

    fina_ls = []

    for i in range(1, len(df_list[1:])):
        s = rSubset(df_list[2:-1], i)
        combi_list = []
        for j in s:
            combi_list.append(list(j))
        fina_ls.append(combi_list)

    print("Length of Cobmbinations", len(fina_ls))

    ## Created dataframes with all combinations...

    multi_corr = []
    # fina_ls[0]
    for j in fina_ls[:]:
        for i in j[:]:

            i.insert(0, 'Company Name')
            i.insert(1, 'Portfolio')
            i.extend(['Shareprice_Appriciation'])
            df1 = pd.DataFrame(Df[i])
            multi_corr.append(df1)

    #############

    ### Getting combinations without multicollinearity
    All_Dataframes = []
    reductions_Dfs = []
    for i in range(len(multi_corr[:])):
        n = pd.DataFrame(multi_corr[i].iloc[:, :-1].corr()[:] >= 0.75)
        leng = len(n)
        s = n.values
        j = np.eye(leng) == 1
        comparison = s == j
        equal_arrays = comparison.all()
        if equal_arrays == True:
            All_Dataframes.append(multi_corr[i])
        else:
            reductions_Dfs.append(multi_corr[i])
    print('After Multi_Collinearity', len(All_Dataframes))
    print('Reductions ', len(reductions_Dfs))

    #     ## Giving the ranks to features... depends on correlations with Return%...

    for frame in All_Dataframes:
        copied_frame = frame.copy()
        correlation = frame.corr()
        copied_Cor = correlation.copy()
        for j in range(0, len(copied_Cor.columns) - 1):  ## Its -2
            columns = list(copied_Cor.columns)
            #print('Value',columns[j])
            columns_name = columns[j]
            k = len(copied_Cor.columns) - 1
            #print('K Value',k)
            i = j + 2  ## J+ 4 means after from 5th index
            #             if copied_Cor.iloc[j,k] >= 0.05: ## Dont use
            frame[str(columns_name) + '_Rank'] = copied_frame.iloc[:, i].rank(
                method='first', ascending=0)
    my_dfs = All_Dataframes.copy()
    for f in my_dfs:

        L = f.columns.get_loc('Shareprice_Appriciation') + 1
        col = f.iloc[:, L:]
        f['Avg_Rank'] = col.mean(axis=1).round()

    Avg_ranks = my_dfs.copy()

    my_dfs = []
    for new in Avg_ranks:
        i = new.columns.get_loc("Shareprice_Appriciation") + 1
        j = new.columns.get_loc("Avg_Rank")
        l = len(new.columns[i:j])
        z = np.ones(l).tolist()
        for k in range(0, l):
            arrs = z.copy()
            for m in range(1, 6):
                arrs[k] = m

                df1 = new.copy()
                p = df1.columns.get_loc("Shareprice_Appriciation") + 1
                q = df1.columns.get_loc("Avg_Rank")
                cols = list(df1.columns[p:q])
                weightage_list = [
                    '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_',
                    '_Weight_', '_Weight_', '_Weight_', '_Weight_', '_Weight_',
                    '_Weight_'
                ]
                Separater_list = [
                    '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|', '|',
                    '|', '|', '|'
                ]
                w_c = get_columnnames(weightage_list, cols, arrs,
                                      Separater_list)
                #             print(p,q)
                #             print(cols)
                #             print(arrs)
                df1[w_c] = df1.iloc[:, p:q] * arrs
                #             print(df1)
                df = pd.DataFrame(df1)
                #         print(df)
                my_dfs.append(df)
                #print('***')
        #print('#####')

    for frames in my_dfs:
        #     frame = frames.copy()
        i = frames.columns.get_loc("Avg_Rank") + 1
        frames['Weightages_Avarage_Rank'] = frames.iloc[:, i:].mean(
            axis=1).round()

        ##
        j = frames.columns.get_loc("Weightages_Avarage_Rank")
        frames['Weighatages_Rank'] = frames.iloc[:, j].rank(method='first',
                                                            ascending=1)
    ##

    lables_ = []
    for i in range(1, int(np.sqrt(Df.shape[0]).round()) + 1):
        lab = 'Q' + str(i)
        lables_.append(lab)

    Sorted_dfs = []
    for frames in my_dfs:
        #frames["Quartiles"] = pd.qcut(frames['Weightages_Avarage_Rank'].rank(method='first'), int(np.sqrt(frames.shape[0]).round()) , labels=["Q1", "Q2", "Q3","Q4","Q5","Q6","Q7"])
        frames["Quartiles"] = pd.qcut(
            frames['Weighatages_Rank'].rank(method='first'),
            int(np.sqrt(frames.shape[0]).round()),
            labels=lables_)
        Testing_Q = frames.copy()
        Sort_df = sort_quartiles(Testing_Q)
        sortted_q = list(Sort_df.iloc[:, 0])
        # if sortted_q[0] > sortted_q[1] > sortted_q[2] > sortted_q[3]:
        #     print('Yes Falling down...')
        #     Falling_down.append(list(Sort_df.columns))
        # else:
        Sorted_dfs.append(Sort_df)

    return my_dfs, Sorted_dfs, reductions_Dfs
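# A hedged call sketch for automate_Raking. The CSV name is hypothetical, and the snippet also
# relies on helpers (rSubset, get_columnnames, sort_quartiles) defined elsewhere in the
# original source.
Data = pd.read_csv('ranking_input.csv')   # expects [Company Name, Portfolio, Para1, ..., Shareprice_Appriciation]
my_dfs, Sorted_dfs, reductions_Dfs = automate_Raking(Data)
print(len(my_dfs), len(Sorted_dfs), len(reductions_Dfs))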
Example #37
                'perfect_pred',
                ascending=False)[diagnostic_cols_to_show].head())
        print("MOVES LEAST LIKELY TO MAKE THE BEST MOVE:")
        print(
            moves_to_test.sort('perfect_pred',
                               ascending=True)[diagnostic_cols_to_show].head())

    else:
        imperfect_moves = moves_to_test[moves_to_test['clipped_movergain'] < 0]
        X = imperfect_moves[features]
        y = imperfect_moves['clipped_movergain']
        pred_y = model.predict(X)
        mask = y < pred_y
        score = float(mask.sum()) / y.shape[0]
        print((
            'imperfect-move error-size quantile model for %s: true quantile is %f'
            % (key, score)))
        combo = concat([Series(y.values), Series(pred_y)], axis=1)
        combo_groups = qcut(combo[1], 10)
        combo_stats = combo.groupby(combo_groups)[0].agg({
            'mean':
            np.mean,
            'q':
            lambda x: np.percentile(x,
                                    float(mg_quant) * 100),
            'count':
            len
        })
        print(("%s distribution of error by prediction range:\n%s" %
               (elo_name, combo_stats)))
Example #38
def add_stats(df):

    df['gp'] = df.apply(active_games, axis=1)
    df['min_3g_avg'] = df.apply(min_3g_avg, axis=1)

    #df['min_7d_avg'] = df.apply(min_avg_7_days, axis=1)
    df['min_90d_avg'] = df.apply(min_avg_90_days, axis=1)
    df['dk_avg_90_days'] = df.apply(dk_avg_90_days, axis=1)
    # df['teampts_avg'] = df.apply(team_pts_90_days, axis=1)
    # df['opppts_avg'] = df.apply(opp_pts_90_days, axis=1)
    df['dk_per_min'] = df['dk_avg_90_days'] / df['min_90d_avg']
    # transform DK points to more normal distro
    df['DKP_trans'] = df['DKP']**.5
    # create columns for - positive DK change; negative DK change
    # df['dk_sal_increase'] = np.where((df['dk_change'] > 0), True, False)
    # df['dk_sal_decrease'] = np.where((df['dk_change'] < 0), True, False)
    # create standard dev and max columns
    df['dk_std_90_days'] = df.apply(dk_std_90_days, axis=1)
    df['dk_max_30_days'] = df.apply(dk_max_30_days, axis=1)
    # get min when starting / bench
    df['min_when_start'] = df.apply(min_when_starting, axis=1)
    df['min_when_bench'] = df.apply(min_when_bench, axis=1)
    # count games started in past week
    df['starts_past_week'] = df.apply(starts_past_week, axis=1)
    # adjust minutes
    df['min_proj'] = df.apply(adjust_minutes, axis=1)
    # add dvp
    df['dvp'] = df.apply(dvp, axis=1)
    # add dvp rank
    df['dvprank'] = pd.qcut(
        df['dvp'], [
            0.05, 0.1, 0.25, 0.5, 0.75, .93, 1], labels=False)
    # combine PACE and dvp
    df['pace_dvp'] = (df['pace_sum'] / 10) + df['dvp']

    # create summary stats
    df['pts'] = df['Stats'].str.extract('(\d*)pt')
    df['rbs'] = df['Stats'].str.extract('(\d*)rb')
    df['stl'] = df['Stats'].str.extract('(\d*)st')
    df['ast'] = df['Stats'].str.extract('(\d*)as')
    df['blk'] = df['Stats'].str.extract('(\d*)bl')
    df['3pm'] = df['Stats'].str.extract('(\d*)trey')
    df['fgm'] = df['Stats'].str.extract('(\d*)-\d*fg')
    df['fga'] = df['Stats'].str.extract('\d*-(\d*)fg')
    df['ftm'] = df['Stats'].str.extract('(\d*)-\d*ft')
    df['fta'] = df['Stats'].str.extract('\d*-(\d*)ft')
    df['tov'] = df['Stats'].str.extract('(\d*)to')
    df[['pts',
        'rbs',
        'stl',
        'ast',
        'blk',
        '3pm',
        'fgm',
        'fga',
        'ftm',
        'fta',
        'tov']] = df[['pts',
                      'rbs',
                      'stl',
                      'ast',
                      'blk',
                      '3pm',
                      'fgm',
                      'fga',
                      'ftm',
                      'fta',
                      'tov']].apply(lambda x: pd.to_numeric(x,
                                                            errors='coerce'))
    df[['pts', 'rbs', 'stl', 'ast', 'blk', '3pm', 'fgm',
        'fga', 'ftm', 'fta', 'tov']].fillna(0, inplace=True)

    # add yesterdays minutes
    df['min_yest'] = df.apply(min_yest, axis=1)
    # create back to back boolean column [over 30 minutes played the prior day]
    df['b2b'] = df.apply(create_b2b_bool, axis=1)

    # fillna just in case
    df['Minutes'] = df['Minutes'].fillna(value=0)
    df['fga'] = df['fga'].fillna(value=0)
    df['fta'] = df['fta'].fillna(value=0)
    df['tov'] = df['tov'].fillna(value=0)

    # add team stats for usage calc
    df['team_mp'] = df.apply(team_mp, axis=1)
    df['team_fga'] = df.apply(team_fga, axis=1)
    df['team_fta'] = df.apply(team_fta, axis=1)
    df['team_tov'] = df.apply(team_tov, axis=1)

    # add individual usage / 3 game rolling avg
    df['usage'] = df.apply(usage, axis=1)
    df['usage_3g_avg'] = df.apply(usage_3g_avg, axis=1)
    df['usage_5g_avg'] = df.apply(usage_5g_avg, axis=1)

    # add value / 3 game rolling avg for val
    df['value'] = df.apply(value, axis=1)
    df['value_3g_avg'] = df.apply(value_3g_avg, axis=1)

    # add starter min - average minutes played of all the starters
    df['starter_min'] = df.apply(starter_min, axis=1)

    # add game by game minutes vs starter average
    df['min_vs_starters'] = df['Minutes'] - df['starter_min']
    df['mvs_5g_avg'] = df.apply(mvs_5g_avg, axis=1)

    # add 3game average of starter minutes
    df['starter_5g_avg'] = df.apply(starter_5g_avg, axis=1)

    # add rolling avg of fga
    df['fga_3g_avg'] = df.apply(fga_3g_avg, axis=1)

    # add double double count
    df['dbl_dbl_cnt'] = df.apply(dbl_dbl, axis=1)
    # create "double double per game" stat
    df['dbl_dbl_per_game'] = df['dbl_dbl_cnt'] / df['gp']
    # combo stat: Minutes + FGA + dbl_dbl_per_game
    df['combo'] = df['min_proj'] + df['dbl_dbl_per_game'] + df['fga_3g_avg']

    return(df)
embarkedmapping = {"S": 1, "C": 2, "Q": 3}
trainingset['Embarked'] = trainingset['Embarked'].map(embarkedmapping)
testingset['Embarked'] = testingset['Embarked'].map(embarkedmapping)
testingset.head()


# In[ ]:


#FILLING MISSING FARE VALUES AND MAPPING THEM INTO NUMERIC VALUES
#MISSING VALUE IS BASED ON THE CLASS OF THE PASSENGER
for x in range(len(testingset["Fare"])):
    if pd.isnull(testingset["Fare"][x]):
        pclass = testingset["Pclass"][x]
        testingset["Fare"][x] = round(trainingset[trainingset["Pclass"] == pclass]["Fare"].mean(), 4)
trainingset['FareBin'] = pd.qcut(trainingset['Fare'], 4, labels = [1, 2, 3, 4])
testingset['FareBin'] = pd.qcut(testingset['Fare'], 4, labels = [1, 2, 3, 4])
trainingset = trainingset.drop(['Fare'], axis = 1)
testingset = testingset.drop(['Fare'], axis = 1)
testingset.head()


# # (6). Algorithm Modelling
# We will now use the training set to test the accuracy of the SVM, RF, KNN and DT algorithms.

# In[ ]:


from sklearn.model_selection import train_test_split
p = trainingset.drop(['Survived', 'PassengerId'], axis=1)
targetset = trainingset["Survived"]
Example #40
train = train.drop("Name",axis=1)
test = test.drop("Name",axis=1)


# In[ ]:


print(test.head())
print(train.head())


# In[ ]:


train['Survived'].groupby(pd.qcut(train['Ticket_len'], 4)).mean()
#train['Ticket_len'].groupby(train['Survived']).mean()


# In[ ]:


X_train = train.drop("Survived",axis=1)
Y_train = train["Survived"]
X_test  = test.drop("PassengerId",axis=1).copy()


# In[ ]:


X_train = X_train.drop("PassengerId",axis=1)
Example #41
"""
Created on Sun Mar 15 21:02:40 2020

@author: reocar
"""

# Equal-width binning
# Equal-frequency binning
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import datasets
df = pd.DataFrame([[22, 1], [13, 1], [33, 1], [52, 0], [16, 0], [42, 1],
                   [53, 1], [39, 1], [26, 0], [66, 0]],
                  columns=['age', 'Y'])
df['age_bin_2'] = pd.cut(df['age'], 3)  # equal-width binning
df['age_bin_1'] = pd.qcut(df['age'], 3)  # equal-frequency binning
display(df)

# k-means binning (to be revised)
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=666)
kmodel = KMeans(n_clusters=2)  # k is the number of clusters
kmodel.fit(X_train[:, 0].reshape(len(X_train[:, 0]), 1))  # fit the model
c = pd.DataFrame(kmodel.cluster_centers_)  # cluster centres
c = c.sort_values(by=0)  # sort
w = c.rolling(2).mean().iloc[1:]  # midpoints of adjacent cluster centres (rolling mean, window 2) become the bin boundaries
Example #42
    str) + '_' + data_all['dist1'].astype(str)


# Amt
def Amt_decimal_len(amount):
    split = str(amount).split('.')
    if len(split) > 1:
        return len(split[-1])
    return 0


data_all['Amt_decimal_len'] = data_all['TransactionAmt'].map(Amt_decimal_len)
data_all['Amt_decimal'] = (
    (data_all['TransactionAmt'] - data_all['TransactionAmt'].astype(int)) *
    1000).astype(int)
data_all['Amt_interval'] = pd.qcut(data_all['TransactionAmt'], 20)

cols = [
    'ProductCD', 'card1', 'card2', 'card5', 'card6', 'addr1', 'P_email',
    'R_email'
]
for f in cols:
    data_all[f'Amt_mean_{f}'] = data_all.groupby(
        [f])['TransactionAmt'].transform('mean')
    data_all[f'Amt_std_{f}'] = data_all.groupby(
        [f])['TransactionAmt'].transform('std')
    data_all[f'Amt_pct_{f}'] = (
        data_all['TransactionAmt'] -
        data_all[f'Amt_mean_{f}']) / data_all[f'Amt_std_{f}']
print('Amt cols are done.')
Example #43
0
    "Teenager": 3,
    "Student": 4,
    "Young Adult": 5,
    "Adult": 6,
    "Senior": 7
}
train['AgeGroup'] = train['AgeGroup'].map(age_mapping)
test['AgeGroup'] = test['AgeGroup'].map(age_mapping)

train = train.drop(['Age'], axis=1)
test = test.drop(['Age'], axis=1)
train.head()

# Fare: ticket fare
# Use qcut to split Fare into 4 quantile ranges

train['FareBand'] = pd.qcut(train['Fare'], 4, labels=[1, 2, 3, 4])
test['FareBand'] = pd.qcut(test['Fare'], 4, labels=[1, 2, 3, 4])
train = train.drop(['Fare'], axis=1)
test = test.drop(['Fare'], axis=1)
train.head()

# *********************
# Data modelling
# **********************
train_data = train.drop('Survived', axis=1)
target = train['Survived']

train_data.shape, target.shape
# ((891, 8), (891,))
Example #44
0
        "C": 1,
        "Q": 2
    }).astype(int)

#Family Stuff
for dataset in combine:
    dataset["FamilySize"] = dataset["SibSp"] + dataset[
        "Parch"] + 1  #getting family size
for dataset in combine:
    dataset["IsAlone"] = 0
    dataset.loc[dataset["FamilySize"] == 1, "IsAlone"] = 1

#Creating an interval for fare
for dataset in combine:
    dataset["Fare"] = dataset["Fare"].fillna(train["Fare"].median())
    train["CategoricalFare"] = pd.qcut(train["Fare"], 4)
#mapping fare
for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),
                'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),
                'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
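# A hedged aside (not part of the original kernel): the hard-coded thresholds above
# are the quartile edges of the training fares and can be recovered directly from
# qcut instead of being typed in by hand.
_, fare_edges = pd.qcut(train["Fare"], 4, retbins=True)
print(fare_edges)  # roughly [0.0, 7.91, 14.454, 31.0, max fare]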

#looking at the titles in the names
for dataset in combine:
    dataset["Title"] = dataset.Name.str.extract(" ([A-Za-z]+)\.", expand=False)
#print(pd.crosstab(train["Title"], train["Sex"]))
Example #45
0
def plot_leaflet_network(
        wn,
        node_attribute=None,
        link_attribute=None,
        node_attribute_name='Value',
        link_attribute_name='Value',
        node_size=2,
        node_range=[None, None],
        node_cmap=['cornflowerblue', 'forestgreen', 'gold', 'firebrick'],
        node_cmap_bins='cut',
        node_labels=True,
        link_width=2,
        link_range=[None, None],
        link_cmap=['cornflowerblue', 'forestgreen', 'gold', 'firebrick'],
        link_cmap_bins='cut',
        link_labels=True,
        add_legend=False,
        round_ndigits=2,
        zoom_start=13,
        add_to_node_popup=None,
        add_to_link_popup=None,
        filename='leaflet_network.html'):
    """
    Create an interactive scalable network graphic on a Leaflet map using folium.  

    Parameters
    ----------
    wn : wntr WaterNetworkModel
        A WaterNetworkModel object

    node_attribute : None, str, list, pd.Series, or dict, optional

        - If node_attribute is a string, then a node attribute dictionary is
          created using node_attribute = wn.query_node_attribute(str)
        - If node_attribute is a list, then each node in the list is given a 
          value of 1.
        - If node_attribute is a pd.Series, then it should be in the format
          {nodeid: x} where nodeid is a string and x is a float. 
        - If node_attribute is a dict, then it should be in the format
          {nodeid: x} where nodeid is a string and x is a float

    link_attribute : None, str, list, pd.Series, or dict, optional

        - If link_attribute is a string, then a link attribute dictionary is
          created using edge_attribute = wn.query_link_attribute(str)
        - If link_attribute is a list, then each link in the list is given a 
          value of 1.
        - If link_attribute is a pd.Series, then it should be in the format
          {linkid: x} where linkid is a string and x is a float. 
        - If link_attribute is a dict, then it should be in the format
          {linkid: x} where linkid is a string and x is a float.

    node_attribute_name : str, optional 
        The node attribute name, which is used in the node popup and node legend
        
    link_attribute_name : str, optional 
        The link attribute name, which is used in the link popup and link legend
        
    node_size : int, optional
        Node size 

    node_range : list, optional
        Node range ([None,None] indicates autoscale)

    node_cmap : list of color names, optional
        Node colors 
    
    node_cmap_bins: string, optional
        Node color bins, 'cut' or 'qcut'
    
    node_labels: bool, optional
        If True, the graph will include each node labelled with its name. 
        
    link_width : int, optional
        Link width

    link_range : list, optional
        Link range ([None,None] indicates autoscale)

    link_cmap : list of color names, optional
        Link colors
    
    link_cmap_bins: string, optional
        Link color bins, 'cut' or 'qcut'
        
    link_labels: bool, optional
        If True, the graph will include each link labelled with its name. 
    
    add_legend: bool, optional
         Add a legend to the map
    
    round_ndigits : int, optional
        Rounds digits in the popup
        
    zoom_start : int, optional
        Zoom start used to set initial scale of the map
    
    add_to_node_popup : None or pd.DataFrame, optional
        To add additional information to the node popup, use a DataFrame with 
        node name as index and attributes as values.  Column names will be added
        to the popup along with each value for a given node.
        
    add_to_link_popup : None or pd.DataFrame, optional
        To add additional information to the link popup, use a DataFrame with 
        link name as index and attributes as values.  Column names will be added
        to the popup along with each value for a given link.
        
    filename : str, optional
        Filename used to save the map
    """

    if folium is None:
        raise ImportError('folium is required')

    if node_attribute is not None:
        if isinstance(node_attribute, list):
            node_cmap = ['red']
        node_attribute = _format_node_attribute(node_attribute, wn)
        node_attribute = pd.Series(node_attribute)
        if node_range[0] is not None:
            node_attribute[node_attribute < node_range[0]] = node_range[0]
        if node_range[1] is not None:
            node_attribute[node_attribute > node_range[1]] = node_range[1]
        if node_cmap_bins == 'cut':
            node_colors, node_bins = pd.cut(node_attribute,
                                            len(node_cmap),
                                            labels=node_cmap,
                                            retbins=True)
        elif node_cmap_bins == 'qcut':
            node_colors, node_bins = pd.qcut(node_attribute,
                                             len(node_cmap),
                                             labels=node_cmap,
                                             retbins=True)

    if link_attribute is not None:
        if isinstance(link_attribute, list):
            link_cmap = ['red']
        link_attribute = _format_link_attribute(link_attribute, wn)
        link_attribute = pd.Series(link_attribute)
        if link_range[0] is not None:
            link_attribute[link_attribute < link_range[0]] = link_range[0]
        if link_range[1] is not None:
            link_attribute[link_attribute > link_range[1]] = link_range[1]
        if link_cmap_bins == 'cut':
            link_colors, link_bins = pd.cut(link_attribute,
                                            len(link_cmap),
                                            labels=link_cmap,
                                            retbins=True)
        elif link_cmap_bins == 'qcut':
            link_colors, link_bins = pd.qcut(link_attribute,
                                             len(link_cmap),
                                             labels=link_cmap,
                                             retbins=True)

    G = wn.get_graph()
    pos = nx.get_node_attributes(G, 'pos')
    center = pd.DataFrame(pos).mean(axis=1)

    m = folium.Map(location=[center.iloc[1], center.iloc[0]],
                   zoom_start=zoom_start,
                   tiles='cartodbpositron')
    #folium.TileLayer('cartodbpositron').add_to(m)

    # Node popup
    node_popup = {k: '' for k in wn.node_name_list}
    if node_labels:
        for name, node in wn.nodes():
            node_popup[name] = node.node_type + ': ' + name
            if node_attribute is not None:
                if name in node_attribute.index:
                    node_popup[name] = node_popup[name] + '<br>' + \
                        node_attribute_name + ': ' + '{:.{prec}f}'.format(node_attribute[name], prec=round_ndigits)
            if add_to_node_popup is not None:
                if name in add_to_node_popup.index:
                    for key, val in add_to_node_popup.loc[name].items():
                        node_popup[name] = node_popup[name] + '<br>' + \
                            key + ': ' + '{:.{prec}f}'.format(val, prec=round_ndigits)

    # Link popup
    link_popup = {k: '' for k in wn.link_name_list}
    if link_labels:
        for name, link in wn.links():
            link_popup[name] = link.link_type + ': ' + name
            if link_attribute is not None:
                if name in link_attribute.index:
                    link_popup[name] = link_popup[name] + '<br>' + \
                        link_attribute_name + ': ' + '{:.{prec}f}'.format(link_attribute[name], prec=round_ndigits)
            if add_to_link_popup is not None:
                if name in add_to_link_popup.index:
                    for key, val in add_to_link_popup.loc[name].items():
                        link_popup[name] = link_popup[name] + '<br>' + \
                            key + ': ' + '{:.{prec}f}'.format(val, prec=round_ndigits)

    if node_size > 0:
        for name, node in wn.nodes():
            loc = (node.coordinates[1], node.coordinates[0])
            radius = node_size
            color = 'black'
            if node_labels:
                popup = node_popup[name]
            else:
                popup = None

            if node_attribute is not None:
                if name in node_attribute.index:
                    color = node_colors[name]
                else:
                    radius = 0.1

            folium.CircleMarker(loc,
                                popup=popup,
                                color=color,
                                fill=True,
                                fill_color=color,
                                radius=radius,
                                fill_opacity=0.7,
                                opacity=0.7).add_to(m)

    if link_width > 0:
        for name, link in wn.links():
            start_loc = (link.start_node.coordinates[1],
                         link.start_node.coordinates[0])
            end_loc = (link.end_node.coordinates[1],
                       link.end_node.coordinates[0])
            weight = link_width
            color = 'black'
            if link_labels:
                popup = link_popup[name]
            else:
                popup = None

            if link_attribute is not None:
                if name in link_attribute.index:
                    color = link_colors[name]
                else:
                    weight = 1.5

            folium.PolyLine([start_loc, end_loc],
                            popup=popup,
                            color=color,
                            weight=weight,
                            opacity=0.7).add_to(m)

    if (add_legend) & ((len(node_cmap) >= 1) or (len(link_cmap) >= 1)):
        if node_attribute is not None:  #Produce node legend
            height = 50 + len(node_cmap) * 20 + (
                int(len(node_attribute_name) / 20) + 1) * 20
            node_legend_html = """<div style="position: fixed; 
        bottom: 50px; left: 50px; width: 150px; height: """ + str(
                height) + """px; 
        background-color:white;z-index:9999; font-size:14px; "><br>
            <b><P ALIGN=CENTER>""" + "Node Legend: " + node_attribute_name + """</b> </P>"""
            for color, val in zip(node_cmap, node_bins[0:-1]):
                val = '{:.{prec}f}'.format(val, prec=round_ndigits)
                node_legend_html += """
                &emsp;<i class="fa fa-circle fa-1x" 
                style="color:""" + color + """ "></i> >= """ + val + """ <br>"""
            node_legend_html += """</div>"""
            m.get_root().html.add_child(folium.Element(node_legend_html))

        if link_attribute is not None:  #Produce link legend
            height = 50 + len(link_cmap) * 20 + (
                int(len(link_attribute_name) / 20) + 1) * 20
            link_legend_html = """<div style="position: fixed; 
			bottom: 50px; left: 250px; width: 150px; height: """ + str(height) + """px; 
			background-color:white;z-index:9999; font-size:14px; "><br>
            <b><P ALIGN=CENTER>""" + "Link Legend: " + link_attribute_name + """</b> </P>"""
            for color, val in zip(link_cmap, link_bins[0:-1]):
                val = '{:.{prec}f}'.format(val, prec=round_ndigits)
                link_legend_html += """
               &emsp;<i class="fa fa-minus fa-1x" 
                style="color:""" + color + """ "></i> >= """ + val + """ <br>"""
            link_legend_html += """</div>"""
            m.get_root().html.add_child(folium.Element(link_legend_html))

    #plugins.Search(points, search_zoom=20, ).add_to(m)
    #if add_longlat_popup:
    #    m.add_child(folium.LatLngPopup())

    folium.LayerControl().add_to(m)

    m.save(filename)
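
# A hedged usage sketch (assumes wntr and its folium dependency are installed and
# that an EPANET input file such as 'Net3.inp' is available with lon/lat node
# coordinates); nodes are colored by elevation using quantile ('qcut') bins.
import wntr

wn_example = wntr.network.WaterNetworkModel('Net3.inp')
plot_leaflet_network(wn_example, node_attribute='elevation',
                     node_attribute_name='Elevation',
                     node_cmap_bins='qcut', add_legend=True,
                     filename='net3_elevation.html')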
Example #46
0
def get_Data(**kw):
    kwargs = {"varName":None,
            "args":None,
             "qcut":0,
              "qType":"norm",
              "orth":False
              }
    #kwargs["orth"] = True
    kwargs.update(kw)
    data = briefstats.data
    data.loc[:,"vwap"] = briefstats.get_vwap(1).values.reshape(-1)
    X = pd.DataFrame()
    _varName = kwargs["varName"]
    if not isinstance(_varName,list):
        _varName = [_varName,]
    filename = get_hash(_varName, kwargs["args"])
    try:
        __col = np.load('data/col{}.npy'.format(filename))
        __index = np.load("data/index{}.npy".format(filename))
        __values = np.load("data/values{}.npy".format(filename))
        X = pd.DataFrame(__values,columns=__col,index=__index)
        return X
    except:
        pass
    mabp = (data["askPrc"] + data["bidPrc"]) / 2
    mabpD = mabp.diff(1)
    mabpD.iloc[0] = 0
    _ratio = ((data["askQty"] * data["askPrc"] - data["bidQty"] * data["bidPrc"]) / (data["askQty"] * data["askPrc"] + data["bidQty"] * data["bidPrc"])).values.reshape(-1)

    qwap = (data["askPrc"]*data["askQty"] + data["bidPrc"]*data["bidQty"]) / (data["askQty"]+data["bidQty"])
    qwapD = qwap.diff(1)
    qwapD.iloc[0] = 0

    def args(v,default=1,lb=None):
        return get_args(v,kwargs['args'],default,lb=lb)

    for varName  in _varName:
        try:
            print -1,varName
            X.loc[:,varName] = data[varName].values.reshape(-1)
            #logging.debug(tmp)
            continue
            # return tmp
        except:
            try:
                _vwap = re.match("vwap",varName).span()
                if _vwap is not None:
                    tmp = varName[_vwap[1]:]
                    if tmp == "D":
                        X.loc[:,"vwapD"] = data["vwap"].diff(1).values.reshape(-1)
                        X.iloc[0, -1] = 0
                        continue
                        # return X
                    if tmp == "Log":
                        X.loc[:,"vwapLog"] = np.log(data["vwap"]).diff(1).values.reshape(-1)
                        X.iloc[0, -1] = 0
                        continue
                        # return X
                    if tmp == "":
                        X.loc[:,"vwap"] = briefstats.get_vwap(1).values.reshape(-1)
                        continue
                    if tmp == "DEWM":
                        vwap = data["vwap"].diff(1)
                        vwap[0] = 0
                        vwapEwm = vwap.ewm(com=1).mean()
                        X.loc[:,"vwapDEWM"] = vwapEwm.values
                        # return X
            except:
                logging.debug("X data do not have vwap")
            try:
                _last = re.match("last", varName).span()
                if _last is not None:
                    logging.debug(["_last is not none",_last])
                    tmp = varName[_last[1]:]
                    logging.debug(tmp)
                    if tmp == "D":
                        X.loc[:, "lastD"] = data["last"].diff(1).values.reshape(-1)
                        X.iloc[0, -1] = 0
                        continue
                        # return X
                    if tmp == "Log":
                        X.loc[:, "lastLog"] = np.log(data["last"]).diff(1).values.reshape(-1)
                        X.iloc[0, -1] = 0
                        continue
                        # return X
            except:
                logging.debug("X data do not have last")
            ####  mabp
            if varName == "mabp":
                try:
                    X.loc[:,"mabp"] = mabp.values.reshape(-1)
                except:
                    X.loc[:, "mabp"] = mabp.reshape(-1)
                continue
            if varName ==  "mabpEWM":
                for _com in args(varName,1):
                    X.loc[:,"mabpEWM{}".format(_com)] = pd.DataFrame(mabp).ewm(com=_com).mean().values.reshape(-1)
                continue
            if varName == "mabpD":
                for _window in args(varName,1,lb=1):
                    mabpDw = mabpD.rolling(window=_window).sum()
                    mabpDw.iloc[:_window] = mabpD.values.reshape(-1)[:_window].cumsum()
                    X.loc[:,"mabpD{}".format(_window)] = mabpDw.values.reshape(-1)
                #print X
                #logging.error(X)
                continue
            if varName == "mabpDEWM":
                for _com in args(varName, 1):
                    mabpDw = mabpD.ewm(com=_com).mean()
                    X.loc[:,"mabpDEWM{}".format(_com)] = mabpDw.values.reshape(-1)
                continue
            #### qwap
            if varName == "qwap":
                X.loc[:,"qwap"] = qwap.values.reshape(-1)
                continue
            if varName == "qwapD":
                X.loc[:,"qwapD"] = qwapD.values.reshape(-1)
                continue
            if varName == "qwapEWM":
                tmp = qwap.ewm(com=1).mean().values.reshape(-1)
                X.loc[:,"qwapDEWM"] =  tmp
                continue
            if varName == "qwapDEWM":
                tmp = qwapD.ewm(com=1).mean().values.reshape(-1)
                X.loc[:, "qwapDEWM"] = tmp
                continue

            if varName == "askDaskbidQty":
                X.loc[:, "askDaskbidQty"] = data["askQty"].values / (data["bidQty"].values + data["askQty"].values)
                continue
                # return X
            if varName == "askDaskbidQtyEWM":
                for _com in args(varName,1):
                    askQty = data["askQty"].ewm(com=_com).mean()
                    bidQty = data["bidQty"].ewm(com=_com).mean()
                    X.loc[:, "askDaskbidQty{}".format(_com) ] = askQty.values / (bidQty.values + askQty.values)
                continue
                # return X
            if varName == "askDaskbidQtyR":
                for _window in args(varName,2,lb=1):
                    askQty = data["askQty"].rolling(window=_window).sum()
                    askQty[:_window] = data["askQty"].values[:_window].cumsum()
                    bidQty = data["bidQty"].rolling(window=_window).sum()
                    bidQty[:_window] = data["bidQty"].values[:_window].cumsum()
                    X.loc[:,"askDaskbidQtyR{}".format(_window)] = (askQty.values/(bidQty.values + askQty.values)).reshape(-1)
            # if varName == "ratio":
            #     openInterestD = data["openInterest"].diff(1)
            #     openInterestD.iloc[0] = 0
            #     X.loc[:, "ratio"] = (openInterestD.values / (data["volumeD"].values + (data["volumeD"].values == 0))).reshape(-1)
            #     continue
            #     # return X
            # if varName == "ratioL":
            #     openInterestD = data["openInterest"].diff(1)
            #     openInterestD.iloc[0] = 0
            #     ratio = openInterestD.values/(data["volumeD"].values+(data["volumeD"].values==0))
            #     logging.error(pd.isna(ratio).sum())
            #     X.loc[:,"ratioL"] = (pd.cut(ratio, bins=[-1.1, -0.75, -0.25, 0.25, 0.75, 1], labels=False) - 3).reshape(-1)
            #     logging.error(pd.isna(X).sum())
            #     continue
            if varName == "fundSpread":
                fundSpread = data["askQty"]*data["askPrc"]-data["bidQty"]*data["bidPrc"]
                for _window in args(varName,1,lb=1):
                    tmp = fundSpread.rolling(window=_window).sum()
                    tmp[:_window] = fundSpread[:_window].cumsum()
                    X.loc[:,"fundSpread{}".format(_window)] = tmp.values.reshape(-1)
                continue
            if varName == "fundSpreadEWM":
                for _com in args(varName,1):
                    ask = pd.DataFrame(data["askQty"] * data["askPrc"]).ewm(com=_com).mean()
                    bid = pd.DataFrame(data["bidQty"] * data["bidPrc"]).ewm(com=_com).mean()
                    X.loc[:, "fundSpreadEWM{}".format(_com)] = (ask - bid).values.reshape(-1)
                continue
            if varName == "askbidDtotalRatio":
                X.loc[:,"askbidDtotalRatio"] = ((data["askQty"]*data["askPrc"]-data["bidQty"]*data["bidPrc"])/(data["askQty"]*data["askPrc"]+data["bidQty"]*data["bidPrc"])).values.reshape(-1)
                continue
            if varName == "askbidDtotalRatioR":
                ask = data["askQty"] * data["askPrc"]
                bid = data["bidQty"] * data["bidPrc"]
                for _window in args(varName,8,lb=1):
                    _window = int(_window)
                    tmpask = ask.rolling(window= _window).sum()
                    tmpask[:_window] = ask[:_window].cumsum()
                    tmpbid = bid.rolling(window=_window).sum()
                    tmpbid[:_window] = bid[:_window].cumsum()
                    X.loc[:,"askbidDtotalRatioR{}".format(_window)]= ((tmpask.values - tmpbid.values)/(tmpask.values + tmpbid.values)).reshape(-1)
                continue

            if varName == "askbidDtotalRatioEWM":
                for com in args(varName,0.1):
                    #com = com/10
                    ask = data["askQty"] * data["askPrc"]
                    ask = ask.ewm(com=com).mean()
                    bid = data["bidQty"] * data["bidPrc"]
                    bid = bid.ewm(com=com).mean()
                    X.loc[:, "askbidDtotalRatioEWM{}".format(com)] = (
                                (ask.values - bid.values) / (ask.values + bid.values)).reshape(-1)
                continue
            # if varName == "askbidDturnover": #wuxiao
            #     D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values))
            #     tmp = ((data["bidQty"]*data["bidPrc"]+data["askQty"]*data["askPrc"])/data["turnoverD"]*10)*D
            #     tmp[np.isinf(tmp)] = np.nan
            #     tmp.fillna(0,inplace=True)
            #     X.loc[:,"askbidDturnover"] = tmp.values.reshape(-1)
            #     #print X
            #     logging.debug(["x nan",pd.isna(X["askbidDturnover"]).sum()])
            #     continue

            if varName == "sov":
                D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values))
                for _window in args(varName,1,lb=1):
                    obv = pd.Series(D * data["volumeD"]).rolling(window=_window).sum()
                    obv[:_window] = (D * data["volumeD"])[:_window].cumsum()
                    logging.debug(obv.values.reshape(-1))
                    X.loc[:,"sov{}".format(_window)] = obv.values.reshape(-1)
                    logging.debug(X)
                continue
            if varName == "sovEWM":
                D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values))
                for _com in args(varName, 1):
                    obvD = pd.DataFrame(D * data["volumeD"]).ewm(com = _com).mean()
                    X.loc[:, "sovEWM{}".format(_com)] = obvD.values.reshape(-1)
                    logging.debug(obvD.values.reshape(-1))
                continue
            if varName == "soo":
                D = np.array(map(lambda x: 1 if x != 0 else 0, mabpD.values))
                openInterestD = data["openInterest"].diff(1)
                for _window in args(varName, 60):
                    openInterestD.iloc[0] = 0
                    soo = pd.DataFrame(D * openInterestD).rolling(window=_window).sum()
                    soo.iloc[:_window,0] = (D * openInterestD).values[:_window].cumsum().reshape(-1)
                    X.loc[:, "soo{}".format(_window)] = soo.values.reshape(-1)
                continue
            if varName == "sooEWM":
                D = np.array(map(lambda x: 1 if x != 0 else 0, mabpD.values))
                openInterestD = data["openInterest"].diff(1)
                openInterestD.iloc[0] = 0
                for _com in args(varName, 1):
                    oboD = pd.DataFrame(D * openInterestD).ewm(com = _com).mean()
                    X.loc[:, "sooEWM{}".format(_com)] = oboD.values.reshape(-1)
                continue
            if varName == "signUpDown":
                D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values))
                for _window in args(varName,2,lb=1):
                    tmp = pd.DataFrame(D).rolling(window=_window).sum()
                    tmp.iloc[:_window,0] = D[:_window].cumsum()
                    if True:
                        tmp = pd.qcut(tmp.values.reshape(-1), 10, duplicates='drop', labels=False)
                        tmp = tmp * (10 / tmp.max())
                        X.loc[:, "signUpDown{}".format(_window)] = tmp.reshape(-1)
                    else:
                        X.loc[:,"signUpDown{}".format(_window)] = tmp.values.reshape(-1)
                continue
            if varName == "signUpDownL":
                D = np.array(map(lambda x: 1 if x > 0 else (0 if x == 0 else -1), mabpD.values))
                tmp = np.full(shape=(len(D),),fill_value=0.0)
                _t = 0
                for i in xrange(len(tmp)):
                    tmp[i] = _t
                    if D[i] > 0:
                        if _t > 0:
                            _t += 1
                        else:
                            _t = 1
                    elif D[i] < 0:
                        if _t < 0:
                            _t -= 1
                        else:
                            _t = -1
                    else:
                        _t = 0
                X.loc[:,"signUpDownL"] = tmp.reshape(-1)
                continue
            if varName == "midDvwap":
                for _diff in args(varName,5,lb=1):
                    vwap = briefstats.get_vwap(_diff)
                    X.loc[:,"midDvwap{}".format(_diff)] = mabp.values - vwap.values
                    continue
            if varName == "qwapDvwap":
                qwap = briefstats.get_qwap()
                for _diff in args(varName,5,lb=1):
                    vwap = briefstats.get_vwap(_diff)

                    X.loc[:, "qwapDvwap{}".format(_diff)] = qwap.values - vwap.values
                #print X
                continue

            if varName == "rsv":
                for _window in args(varName,8,lb=2):
                    X.loc[:,'rsv{}'.format(_window)]=get_rsv(mabp,window=_window).values.reshape(-1)
                continue
            if varName == "rsvEWM":
                for _com in args(varName,0.5):
                    rsv = get_rsv(mabp)
                    rsvEWM = rsv.ewm(_com)
                    X.loc[:,"rsvEWM{}".format(_com)] = rsvEWM.mean()
                continue
            if varName == "rsvEWM":
                for _com in args():
                    pass
    if kwargs["qcut"]>0:
        _columns = X.columns
        if kwargs["qType"] == 'rank':
            for _col in _columns:
                tmp = pd.qcut(X.loc[:,_col],kwargs["qcut"],duplicates='drop',labels=False)
                tmp = tmp*(kwargs["qcut"]/tmp.max())
                X.loc[:,_col] = tmp.values.reshape(-1)
        elif kwargs["qType"] in {'mid','left','right'}:
            for _col in _columns:
                tmp = pd.qcut(X.loc[:,_col],kwargs["qcut"],duplicates='drop').apply(lambda x: getattr(x, kwargs["qType"])).pipe(np.asarray)
                X.loc[:,_col] = tmp.reshape(-1)
        else:
            pass
    if kwargs["orth"]:
        _columns = X.columns
        for _col in _columns:
            tmp = orth(X.loc[:,_col],_ratio)
            try:
                X.loc[:, _col] = tmp.values.reshape(-1)
            except:
                X.loc[:, _col] = tmp.reshape(-1)

    filename = get_hash(_varName, kwargs["args"])
    try:
        np.save('data/col{}.npy'.format(filename),X.columns)
        np.save("data/index{}.npy".format(filename),X.index)
        np.save("data/values{}.npy".format(filename),X.values)
        logging.debug("save sucessed")
    except:
        logging.debug("save failed")
        pass
    return X
Example #47
0
    dataset.loc[(dataset["Age"] > 29) & (dataset["Age"] <= 39), "Age"] = 3
    dataset.loc[(dataset["Age"] > 29) & (dataset["Age"] <= 39), "Age"] = 3
    dataset.loc[dataset["Age"] > 39, "Age"] = 4
sns.countplot(x="Age", data=train, hue="Survived")

# In[24]:

## Boxplot for Fare
sns.boxplot(x=train["Survived"], y=train["Fare"])

# The Fare feature is strongly right-skewed, so the number of bins was chosen from the third quartile: with n bins, if the last bin starts at the third-quartile value, that n is selected.

# In[25]:

## discretize Fare
pd.qcut(train["Fare"], 8).value_counts()

# In[26]:

for dataset in total:
    dataset.loc[dataset["Fare"] <= 7.75, "Fare"] = 0
    dataset.loc[(dataset["Fare"] > 7.75) & (dataset["Fare"] <= 7.91),
                "Fare"] = 1
    dataset.loc[(dataset["Fare"] > 7.91) & (dataset["Fare"] <= 9.841),
                "Fare"] = 2
    dataset.loc[(dataset["Fare"] > 9.841) & (dataset["Fare"] <= 14.454),
                "Fare"] = 3
    dataset.loc[(dataset["Fare"] > 14.454) & (dataset["Fare"] <= 24.479),
                "Fare"] = 4
    dataset.loc[(dataset["Fare"] > 24.479) & (dataset["Fare"] <= 31),
                "Fare"] = 5
def prepare_ranges(plot_df,groupby):
    if groupby == 'K_value':
        # ranges = K_value_ranges
        # plot_df.loc[:, 'group_range'] = pd.cut(
        #     plot_df[groupby], ranges).astype(str)
        # plot_df.loc[plot_df[groupby] > ranges[-1],
        #             'group_range'] = '>{}'.format(ranges[-1])
        # plot_df.loc[plot_df[groupby] == ranges[0],
        #             'group_range'] = ' {}'.format(ranges[0])
        # plot_df.loc[plot_df[groupby] < ranges[0],
        #             'group_range'] = '<{}'.format(ranges[0])
        # qcutted = pd.qcut(plot_df[plot_df[groupby]<1][groupby], 9,duplicates='drop')
        # categories = qcutted.cat.categories
        # qcutted_str = qcutted.astype(str)
        # qcutted_str[qcutted_str == str(categories[0])] = '(0, {}]'.format(categories[0].right)
        # qcutted_str[qcutted_str == str(categories[-1])] = '({}, 1)'.format(categories[-1].left)
        # plot_df.loc[plot_df[groupby]<1, 'group_range'] = qcutted_str
        # plot_df.loc[plot_df[groupby]>=1, 'group_range'] = '>= 1'
        ranges = condition_number_ranges
        cutted = pd.cut(plot_df[groupby], ranges,include_lowest=True)
        categories = cutted.cat.categories
        plot_df.loc[:, 'group_range'] = cutted.astype(str)
        plot_df.loc[plot_df[groupby] > ranges[-1],
                    'group_range'] = '>{}'.format(ranges[-1])
        plot_df.loc[plot_df['group_range'] == str(categories[0]),'group_range'] = '[{},{}]'.format(ranges[0],categories[0].right)
        # plot_df.loc[plot_df[groupby] < ranges[0],
        #             'group_range'] = '<{}'.format(ranges[0])
        def custom_sort(col):
            vals = []
            for val in col.tolist():
                if ',' in val:
                    vals.append(float(val.split(',')[1][1:-1]))
                else:
                    # vals.append(float(val[2:]))
                    vals.append(float('inf'))
            return pd.Series(vals)
        return categories,None
    elif groupby in ['isoform_length']:
        def custom_sort(col):
            vals = []
            for val in col.tolist():
                if ',' in str(val):
                    vals.append(float(val.split(',')[1][1:-1]))
                else:
                    # vals.append(float(val[1:]))
                    vals.append(float('inf'))
            return pd.Series(vals)
        plot_df[groupby] = plot_df[groupby].astype(int)
       
        if plot_df[groupby].max() > 3000:
            max_threshold = 4000
            lower, higher = int(plot_df[groupby].min()), 4000
            step_size = 400
        else:
            max_threshold = 2100
            lower, higher = int(plot_df[groupby].min()), 2100
            step_size = 200
        # # max_threshold = np.ceil(np.percentile(plot_df[groupby], 80))
        # # lower, higher = int(plot_df.min()), int(plot_df.max())
        # # step_size = int(math.ceil((higher - lower)/n_bins))
        n_bins = 10
        
        edges = [lower] + list(
            range(step_size, higher+1, step_size))
        cutted,categories = pd.cut(
            plot_df.loc[plot_df[groupby] <= max_threshold, groupby], bins=edges,include_lowest=True,retbins=True)
        return categories,max_threshold
    elif groupby in ['num_exons','num_isoforms']:
        def custom_sort(col):
            vals = []
            for val in col.tolist():
                if ',' in val:
                    vals.append(float(val.split(',')[1][1:-1]))
                else:
                    # vals.append(float(val[1:]))
                    vals.append(float('inf'))
            return pd.Series(vals)
        if groupby == 'num_exons':
            ranges = num_exons_range
        else:
            ranges = num_isoforms_range
        cutted = pd.cut(
        plot_df[groupby], ranges, right=False)
        categories = cutted.cat.categories
        plot_df.loc[:, 'group_range'] = cutted.apply(lambda x:str(x)).astype(str)
        plot_df.loc[plot_df[groupby] >= ranges[-1],
                    'group_range'] = '>={}'.format(ranges[-1])
        plot_df.loc[plot_df['group_range'] == str(
            categories[0]), 'group_range'] = '[{}, {})'.format(int(ranges[0]), int(categories[0].right))
        return categories, ranges[-1]
    else:
        plot_df[groupby] = plot_df[groupby].astype(int)
        max_threshold = np.ceil(np.percentile(plot_df[groupby], 90))
        if (len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())<10):
            n_bins = len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())
        else:
            n_bins = 10
        qcutted,categories = pd.qcut(plot_df.loc[plot_df[groupby] <= max_threshold, groupby], n_bins,labels=False,duplicates='drop',retbins=True)
        # lower, higher = temp_df.min(), temp_df.max()
        # if (len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())<10):
        #     n_bins = len(plot_df.loc[plot_df[groupby] <= max_threshold, groupby].unique())
        # else:
        #     n_bins = 10
        # edges = list(
        #     range(int(lower-1), int(higher), int(math.ceil((higher - lower)/n_bins))))
        # edges.append(higher)
        # plot_df.loc[plot_df[groupby] <= max_threshold, 'group_range'] = pd.cut(
        #     temp_df, bins=edges).astype('str')
        # plot_df.loc[plot_df[groupby] > max_threshold,
        #             'group_range'] = '>{}'.format(max_threshold)
        return categories,max_threshold
Example #49
0
 def fare_ordinal(this) -> object:
     this.train['FareBand'] = pd.qcut(this.train['Fare'], 4, labels=[1, 2, 3, 4])
     this.test['FareBand'] = pd.qcut(this.test['Fare'], 4, labels=[1, 2, 3, 4])
     return this
Example #50
0
#metr = c("age","fare")
metr = ["age","fare"]
#summary(df[metr]) 
df[metr].describe()




# Create nominal variables for all metric variables (for linear models) before imputing -------------------------------

#metr_binned = paste0(metr,"_BINNED_")
metr_binned = [x + "_BINNED_" for x in metr]
#df[metr_binned] = map(df[metr], ~ {
#  cut(., unique(quantile(., seq(0,1,0.1), na.rm = TRUE)), include.lowest = TRUE)
#})
df[metr_binned] = df[metr].apply(lambda x: pd.qcut(x, 10).astype(object))
df[metr_binned].describe()

# Convert missings to own level ("(Missing)")
#df[metr_binned] = map(df[metr_binned], ~ fct_explicit_na(., na_level = "(Missing)"))
df[metr_binned] = df[metr_binned].fillna("(missing)")
#summary(df[metr_binned],11)
df[metr_binned].describe()
{print(df[x].value_counts()[:11]) for x in metr_binned}



# Handling missings ----------------------------------------------------------------------------------------------

# Remove covariates with too many missings from metr 
#misspct = map_dbl(df[metr], ~ round(sum(is.na(.)/nrow(df)), 3)) #misssing percentage
ax1.set_title('   ')
prob2=stats.probplot(data['x2'],dist=stats.norm,plot=ax2)
ax2.set_xlabel('')
ax2.set_title('   ')
prob3=stats.probplot(data['x3'],dist=stats.norm,plot=ax3)
ax3.set_xlabel('')
ax3.set_title('   ')

#2. Binning continuous data (unsupervised and supervised)
# Unsupervised: equal width + equal frequency + clustering
#(1) Fixed-width binning
newdata=np.floor_divide(data,k) # divide by k to form fixed-width bins
newdata=np.floor(np.log10(data))  # map to exponential-width bins via log10
#(2) Quantile binning
df=data.quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])  # decile bin edges
pd.qcut(data,4,labels=False) # quartile binning, returning bin indices
data=pd.Series(data)
data.quantile([0.25,0.5,0.75])
#(3) Cluster-based binning


# Supervised: chi-square binning, univariate decision-tree methods (ID3/C4.5/CART), and IV-maximizing binning for credit scoring
#(1)卡方分箱法

#(2)基于CART的决策树分箱(每个叶子节点的样本量>=总样本量的5%;内部节点再划分所需的最小样本数>=总样本量的10%)
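
# A hedged sketch (an assumption, not the author's implementation below) of the
# CART-based binning just described, using scikit-learn with the leaf/split
# constraints expressed as fractions of the sample.
from sklearn.tree import DecisionTreeClassifier

def cart_bin_edges(x, y, max_bins=8):
    """x: pandas Series of the feature (no NaNs), y: binary target."""
    tree = DecisionTreeClassifier(min_samples_leaf=0.05,   # each leaf >= 5% of samples
                                  min_samples_split=0.1,   # each split >= 10% of samples
                                  max_leaf_nodes=max_bins)
    tree.fit(x.to_frame(), y)
    # internal nodes carry the split thresholds; leaves are marked with feature == -2
    inner = sorted(tree.tree_.threshold[tree.tree_.feature != -2])
    return [float(x.min())] + [float(t) for t in inner] + [float(x.max())]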
import pandas as pd
import numpy as np
sample_set=pd.read_csv('data')
def calc_score_median(sample_set,var):
    '''
    Compute the medians of adjacent scores to use as candidate binary split points for the decision tree.
Example #52
0
# In[7]:

data = data[data['engagement'] > 0]
print(data.shape)
data.head()

# In[9]:

# Now lets check the descriptive stats
data.describe()

# In[10]:

data['engagement_bucket'] = pd.qcut(data['engagement'],
                                    q=[0, 0.5, 0.75, 1],
                                    labels=['Low', 'Medium', 'High'])
data.head()

# In[11]:

# sns.countplot(x='engagement_bucket', data=data)
# plt.show()

# In[12]:

# Creating time related features such as time, day, etc.
data['day'] = data['published'].dt.day
data['hour'] = data['published'].dt.hour
data['week_day'] = data['published'].dt.weekday
    #complete missing fare with median
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace=True)

ds_train.info()

#Delete unwanted columns
drop_column = ['PassengerId', 'Cabin', 'Ticket']
ds_train.drop(drop_column, axis=1, inplace=True)
ds_test.drop(drop_column, axis=1, inplace=True)

for dataset in X_pack:
    #Discrete variables
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['Title'] = dataset['Name'].str.split(
        ", ", expand=True)[1].str.split(".", expand=True)[0]
    dataset['FareBin'] = pd.qcut(
        dataset['Fare'], 4, labels=['cheap', 'medium', 'high', 'expensive'])
    dataset['AgeBin'] = pd.cut(
        dataset['Age'].astype(int),
        5,
        labels=['kid', 'young_adult', 'adult', 'mature', 'old'])
#cleanup rare title names
stat_min = 10
title_names = (ds_train['Title'].value_counts() < stat_min
               )  # True/False separation

ds_train['Title'] = ds_train['Title'].apply(
    lambda x: 'Unique' if title_names.loc[x] else x)
print(ds_train['Title'].value_counts())

#define y variable aka target/outcome
Target = ['Survived']
Example #54
0
    def bin_score_deciles(self, fname='Deciles', bar=True, line=True, ylabel_bar='Probability',
                          xlabel_bar='Score Deciles', ylabel_line='True', xlabel_line='Predicted',
                          opacity=0.8, title='Deciles', plot_format='.pdf'):

        """This method is used to plot the performance of the predicted scores of the model verses the true label
        based on the deciles of the predictions. First the prediction scores are divided into 10 ranges (deciles),
        the mean of the true label and the prediction scores are calculated for each range and plotted

        :param fname: str
            The name of the file under which the plot is stored

        :param bar: bool
            Whether a bar plot is requested for the deciles or not

        :param line: bool
            Whether a line plot is requested for the deciles or not

        :param ylabel_bar: str
            It defines what to be written on the Y-axis of the bar plot

        :param xlabel_bar: str
            It defines what to be written on the X-axis of the bar plot

        :param ylabel_line: str
            It defines what to be written on the Y-axis of the line plot

        :param xlabel_line: str
            It defines what to be written on the X-axis of the line plot

        :param opacity: float
            The degree of the opacity of the bar plot

        :param title: str
            The title of the plot

        :param plot_format: str
            This defines the format used to save the plot '.png', '.jpg', '.pdf'

        :return None, It saves the requested plot on disk
        """

        df = pd.DataFrame({'CHURN_SCORE': self.pred_score, 'TRUE_SCORE': self.true_label})
        deciles = pd.qcut(df['CHURN_SCORE'], 10, duplicates='drop')
        df['SCORE_GROUP'] = deciles.values.codes
        df_graph = df.groupby(['SCORE_GROUP'])[['CHURN_SCORE', 'TRUE_SCORE']].mean().reset_index()

        if bar:
            fig, ax = plt.subplots(figsize=(5, 5))
            ax = df_graph.plot(x='SCORE_GROUP', y='CHURN_SCORE', kind='bar', ax=ax, legend=False,
                               color=Config.colors['RED'], label='Predicted', alpha=opacity)
            ax = df_graph.plot(x='SCORE_GROUP', y='TRUE_SCORE', kind='bar', ax=ax, legend=False,
                               color=Config.colors['YEL'], label='True', alpha=opacity)
            ax.set_ylabel(ylabel_bar)
            ax.set_xlabel(xlabel_bar)
            ax.set_title(title + ' (bar)', fontsize=Config.TIT_FS, fontweight='bold')
            ax.legend(loc="best")
            fig.savefig(os.path.join(self.viz_dir, fname + '_bar' + plot_format),
                        bbox_inches='tight')
            plt.close()

        if line:
            fig, ax = plt.subplots(figsize=(5, 5))
            ax = df_graph.plot(x='CHURN_SCORE', y='TRUE_SCORE', ax=ax, legend=False,
                               color=Config.colors['RED'])
            ax.set_xlabel(xlabel_line)
            ax.set_ylabel(ylabel_line)
            ax.set_title(title + ' (line)', fontsize=Config.TIT_FS, fontweight='bold')
            fig.savefig(os.path.join(self.viz_dir, fname + '_line' + plot_format),
                        bbox_inches='tight')
            plt.close()
Example #55
0
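# The RSI helper called inside main() below is not included in this excerpt; a
# minimal sketch (an assumption about its definition) of a simple rolling-mean
# Relative Strength Index is given here so the example is self-contained.
def RSI(prices, interval=14):
    delta = prices.diff()
    gain = delta.clip(lower=0).rolling(interval).mean()
    loss = (-delta.clip(upper=0)).rolling(interval).mean()
    return 100 - 100 / (1 + gain / loss)
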
# Imports needed by this script (added; they are not part of the pasted excerpt)
import numerapi
import numpy as np
import pandas as pd
import yfinance
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta, FR
from sklearn.ensemble import GradientBoostingRegressor


def main():
    '''Creates example_signal_upload.csv to upload for validation and live data submission'''
    napi = numerapi.SignalsAPI()

    # read in list of active Signals tickers which can change slightly era to era
    eligible_tickers = pd.Series(napi.ticker_universe(),
                                 name='bloomberg_ticker')
    print(f"Number of eligible tickers: {len(eligible_tickers)}")

    # read in yahoo to bloomberg ticker map, still a work in progress, h/t wsouza
    ticker_map = pd.read_csv(
        'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
    )
    print(f"Number of tickers in map: {len(ticker_map)}")

    # map eligible numerai tickers to yahoo finance tickers
    yfinance_tickers = eligible_tickers.map(
        dict(zip(ticker_map['bloomberg_ticker'],
                 ticker_map['yahoo']))).dropna()
    bloomberg_tickers = ticker_map['bloomberg_ticker']
    print(f'Number of eligible, mapped tickers: {len(yfinance_tickers)}')

    # download data
    n = 1000  # chunk row size
    chunk_df = [
        yfinance_tickers.iloc[i:i + n]
        for i in range(0, len(yfinance_tickers), n)
    ]

    concat_dfs = []
    print("Downloading data...")
    for df in chunk_df:
        try:
            # set threads = True for faster performance, but tickers will fail, script may hang
            # set threads = False for slower performance, but more tickers will succeed
            temp_df = yfinance.download(df.str.cat(sep=' '),
                                        start='2005-12-01',
                                        threads=False)
            temp_df = temp_df['Adj Close'].stack().reset_index()
            concat_dfs.append(temp_df)
        except:  # simplejson.errors.JSONDecodeError:
            pass

    full_data = pd.concat(concat_dfs)

    # properly position and clean raw data, after taking adjusted close only
    full_data.columns = ['date', 'ticker', 'price']
    full_data.set_index('date', inplace=True)
    # convert yahoo finance tickers back to numerai tickers
    full_data['bloomberg_ticker'] = full_data.ticker.map(
        dict(zip(ticker_map['yahoo'], bloomberg_tickers)))
    print('Data downloaded.')
    print(
        f"Number of tickers with data: {len(full_data.bloomberg_ticker.unique())}"
    )

    ticker_groups = full_data.groupby('ticker')
    full_data['RSI'] = ticker_groups['price'].transform(lambda x: RSI(x))

    # group by era (date) and create quintile labels within each era, useful for learning relative ranking
    date_groups = full_data.groupby(full_data.index)
    full_data['RSI_quintile'] = date_groups['RSI'].transform(
        lambda group: pd.qcut(group, 5, labels=False, duplicates='drop'))
    full_data.dropna(inplace=True)

    # create lagged features grouped by ticker
    ticker_groups = full_data.groupby('ticker')
    num_days = 5
    # lag 0 is that day's value, lag 1 is yesterday's value, etc
    for day in range(num_days + 1):
        full_data[f'RSI_quintile_lag_{day}'] = ticker_groups[
            'RSI_quintile'].transform(lambda group: group.shift(day))

    # create difference of the lagged features and absolute difference of the lagged features (change in RSI quintile by day)
    for day in range(num_days):
        full_data[f'RSI_diff_{day}'] = full_data[
            f'RSI_quintile_lag_{day}'] - full_data[
                f'RSI_quintile_lag_{day + 1}']
        full_data[f'RSI_abs_diff_{day}'] = np.abs(
            full_data[f'RSI_quintile_lag_{day}'] -
            full_data[f'RSI_quintile_lag_{day + 1}'])

    # define column names of features, target, and prediction
    feature_names = [f'RSI_quintile_lag_{num}' for num in range(num_days)] + [
        f'RSI_diff_{num}' for num in range(num_days)
    ] + [f'RSI_abs_diff_{num}' for num in range(num_days)]
    print(f'Features for training:\n {feature_names}')

    TARGET_NAME = 'target'
    PREDICTION_NAME = 'signal'

    # read in Signals targets
    targets = pd.read_csv('historical_targets.csv')
    targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')

    # merge our feature data with Numerai targets
    ML_data = pd.merge(full_data.reset_index(),
                       targets,
                       on=['date', 'bloomberg_ticker']).set_index('date')
    # print(f'Number of eras in data: {len(ML_data.index.unique())}')

    # for training and testing we want clean, complete data only
    ML_data.dropna(inplace=True)
    ML_data = ML_data[ML_data.index.weekday ==
                      4]  # ensure we have only fridays
    ML_data = ML_data[ML_data.index.value_counts() >
                      50]  # drop eras with under 50 observations per era

    # train test split
    train_data = ML_data[ML_data['data_type'] == 'train'].copy()
    test_data = ML_data[ML_data['data_type'] == 'validation'].copy()

    # train model
    print("Training model...")
    model = GradientBoostingRegressor(subsample=0.1)
    model.fit(train_data[feature_names], train_data[TARGET_NAME])
    print("Model trained.")

    # predict test data
    test_data[PREDICTION_NAME] = model.predict(test_data[feature_names])

    # predict live data
    # choose data as of most recent friday
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime('%Y-%m-%d')

    try:
        live_data = full_data.loc[date_string].copy()
    except KeyError as e:
        print(f"No ticker on {e}")
        live_data = full_data.iloc[:0].copy()
    live_data.dropna(subset=feature_names, inplace=True)

    # get data from the day before, for markets that were closed
    # on the most recent friday
    last_thursday = last_friday - timedelta(days=1)
    thursday_date_string = last_thursday.strftime('%Y-%m-%d')
    thursday_data = full_data.loc[thursday_date_string]
    # Only select tickers than aren't already present in live_data
    thursday_data = thursday_data[~thursday_data.ticker.isin(live_data.ticker.
                                                             values)].copy()
    thursday_data.dropna(subset=feature_names, inplace=True)

    live_data = pd.concat([live_data, thursday_data])

    print(f"Number of live tickers to submit: {len(live_data)}")
    live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

    # prepare and writeout example file
    diagnostic_df = pd.concat([test_data, live_data])
    diagnostic_df['friday_date'] = diagnostic_df.friday_date.fillna(
        last_friday.strftime('%Y%m%d')).astype(int)
    diagnostic_df['data_type'] = diagnostic_df.data_type.fillna('live')
    diagnostic_df[['bloomberg_ticker', 'friday_date', 'data_type',
                   'signal']].reset_index(drop=True).to_csv(
                       'example_signal_upload.csv', index=False)
    print(
        'Example submission completed. Upload to signals.numer.ai for scores and live submission'
    )
Example #56
0
def main():
    train_df = pd.read_csv('data_files/train.csv')
    test_df = pd.read_csv('data_files/test.csv')
    combine = [train_df, test_df]
    # print('{}'.format(train_df.columns.values))
    # print('{}'.format(test_df.columns.values))
    # print('{}'.format(train_df.head()))
    # print('{}'.format(train_df.tail()))
    # print('*' * 40)
    train_df.info()
    # print('*'*40)
    # test_df.info()
    # print('*' * 40)
    # print('{}'.format(train_df.describe(percentiles=[.61, .62])))
    print('{}'.format(train_df.describe(include=['O'])))
    # print('{}'.format(
    #     train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
    #         .sort_values(by='Survived', ascending=False)
    # ))
    # print('{}'.format(
    #     train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean()
    #         .sort_values(by='Survived', ascending=False)
    # ))
    # print('{}'.format(
    #     train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean()
    #         .sort_values(by='Survived', ascending=False)
    # ))

    # print('{}'.format(
    #     train_df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean()
    #         .sort_values(by='Survived', ascending=False)
    # ))
    train_df_age = train_df[["Age", "Survived"]].copy()
    train_df_age['Age'] = train_df_age['Age'].apply(np.round)
    print('{}'.format(train_df_age[["Age", "Survived"]].groupby(
        ['Age'], as_index=False).mean().sort_values(by='Age', ascending=True)))
    # g = sns.FacetGrid(train_df, col='Survived')
    # g.map(plt.hist, 'Age', bins=40)
    #
    # grid = sns.FacetGrid(train_df, col='Pclass', hue='Survived')
    # # grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
    # grid.map(plt.hist, 'Age', alpha=.8, bins=20)
    # grid.add_legend()
    #
    # # grid = sns.FacetGrid(train_df, col='Embarked')
    # grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
    # grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
    # grid.add_legend()

    # grid = sns.FacetGrid(train_df, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'})
    # # grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
    # grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None)
    # grid.add_legend()

    # plt.show()

    #     lets do come cleanup of data
    print('Data before cleanup: {} {} {} {}'.format(train_df.shape,
                                                    test_df.shape,
                                                    combine[0].shape,
                                                    combine[1].shape))
    train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
    test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
    combine = [train_df, test_df]
    print(
        'Data after cleanup: train shape: {} test shape: {} combine shapes:{} {}'
        .format(train_df.shape, test_df.shape, combine[0].shape,
                combine[1].shape))

    # extracting titles from names and replacement
    for data_set in combine:
        data_set['Title'] = data_set.Name.str.extract(r' ([A-Za-z]+)\.',
                                                      expand=False)
    # print('{}'.format(pd.crosstab(train_df['Title'], train_df['Sex'])))

    for data_set in combine:
        data_set['Title'] = data_set['Title'].replace([
            'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
            'Sir', 'Jonkheer', 'Dona'
        ], 'Rare')
        data_set['Title'] = data_set['Title'].replace(['Mlle', 'Ms'], 'Miss')
        data_set['Title'] = data_set['Title'].replace('Mme', 'Mrs')

    # print('{}'.format(pd.crosstab(train_df['Title'], train_df['Sex'])))

    # print('{}'.format(train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()))
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    for data_set in combine:
        data_set['Title'] = data_set['Title'].map(title_mapping)
        data_set['Title'] = data_set['Title'].fillna(0)

    # print('{}'.format(combine[0].head()))

    train_df.drop(['Name', 'PassengerId'], axis=1, inplace=True)
    test_df.drop(['Name'], axis=1, inplace=True)

    # print('{}'.format(combine[0].head()))
    # print('{}'.format(combine[1].head()))

    #     further changing features to numerical, ex sex: male -> 0, female -> 1
    sex_mapping = {"male": 0, "female": 1}
    for data_set in combine:
        data_set['Sex'] = data_set['Sex'].map(sex_mapping).astype(int)

    # print('{}'.format(combine[0].head()))

    #     we will guess NaN values of age through median,
    #     but for given record from correlation between gender and Pclass of all passengers
    # grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', size=2.2, aspect=1.6)
    # grid.map(plt.hist, 'Age', alpha=.5, bins=20)
    # grid.add_legend()
    # plt.show()

    guess_ages = np.zeros((2, 3))  # for every combination of sex and Pclass
    for data_set in combine:
        for i in [0, 1]:  # gender
            for j in [1, 2, 3]:  # Pclass
                guess_df = data_set[(data_set['Sex'] == i) &
                                    (data_set['Pclass'] == j)]['Age'].dropna()
                # alternative for median
                # age_mean = guess_df.mean()
                # age_std = guess_df.std()
                # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
                age_guess = guess_df.median()
                guess_ages[i, j - 1] = int(age_guess / 0.5 + 0.5) * 0.5
        # now assigning computed age guesses
        for i in [0, 1]:  # gender
            for j in [1, 2, 3]:  # Pclass
                data_set.loc[(data_set.Age.isnull()) & (data_set.Sex == i) & (data_set.Pclass == j), 'Age'] = \
                    guess_ages[
                        i, j - 1]
        data_set['Age'] = data_set['Age'].astype(int)

    # print('{}'.format(train_df.head()))
    train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
    # print('{}'.format(train_df.head()))
    # print('{}'.format(
    #     train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand')
    # )
    # )

    #     replacing age values based on bands
    for data_set in combine:
        data_set.loc[data_set['Age'] <= 16, 'Age'] = 0
        data_set.loc[(data_set['Age'] > 16) & (data_set['Age'] <= 32),
                     'Age'] = 1
        data_set.loc[(data_set['Age'] > 32) & (data_set['Age'] <= 48),
                     'Age'] = 2
        data_set.loc[(data_set['Age'] > 48) & (data_set['Age'] <= 64),
                     'Age'] = 3
        data_set.loc[data_set['Age'] > 64, 'Age'] = 4
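        # equivalent sketch (an assumption, not in the original): the hard-coded
        # band edges above match a single call such as
        # data_set['Age'] = pd.cut(data_set['Age'],
        #                          bins=[-1, 16, 32, 48, 64, np.inf], labels=False)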

    train_df.drop(['AgeBand'], axis=1, inplace=True)
    combine = [train_df, test_df]

    for dataset in combine:
        # new FamilySize feature: siblings/spouses + parents/children + the passenger
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    print('{}'.format(train_df[['FamilySize', 'Survived']].groupby(
        ['FamilySize'], as_index=True).agg(
            ['mean',
             'count']).reset_index().sort_values([('Survived', 'mean')],
                                                 ascending=False)))

    for dataset in combine:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # print('{}'.format(train_df.loc[train_df['IsAlone'] == 1, ['IsAlone']].count()))

    train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
    test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)

    combine = [train_df, test_df]

    for dataset in combine:
        dataset['Age*Class'] = dataset.Age * dataset.Pclass

    # print('{}'.format(train_df.head()))
    # print('{}'.format(train_df[['Age*Class', 'Survived']].groupby(['Age*Class'], as_index=False).mean()))

    # print('{}'.format(train_df[['Embarked', 'Survived']].groupby(['Embarked']).count()))
    # print('count of all: {}'.format(train_df.count()))

    most_freq_port = train_df.Embarked.dropna().mode()[0]
    # print('{}'.format(most_freq_port))

    for dataset in combine:
        dataset['Embarked'] = dataset['Embarked'].fillna(most_freq_port)

    result = train_df[['Embarked', 'Survived']].groupby(
        ['Embarked'], as_index=False).mean().sort_values(by='Survived',
                                                         ascending=False)
    # print('{}'.format(result))

    #     converting embarked to numerical feature: S -> 0, C -> 1, Q -> 2
    for dataset in combine:
        dataset['Embarked'] = dataset['Embarked'].map({
            'S': 0,
            'C': 1,
            'Q': 2
        }).astype(int)

        # print('{}'.format(train_df.head()))

        # print('nulls in fare train: {}'.format(train_df.Fare.isnull().sum()))
        # print('nulls in fare test: {}'.format(test_df.Fare.isnull().sum()))
    #     only one missing value for Fare in test_df, so we replace it with the median
    test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].dropna().median())
    # print('nulls in fare test: {}'.format(test_df.Fare.isnull().sum()))
    # print('{}'.format(test_df.head()))
    train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
    # print(train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True))
    #     assigning fareband ordinal values based on ranges
    for dataset in combine:
        dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),
                    'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),
                    'Fare'] = 2
        dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
        dataset['Fare'] = dataset['Fare'].astype(int)

    train_df = train_df.drop(['FareBand'], axis=1)
    print('{}'.format(train_df.head()))
    combine = [train_df, test_df]
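    # sketch (an assumption, not part of the original): instead of hard-coding the
    # FareBand edges above, the ordinal codes could be taken directly from the
    # quartile binning, e.g.
    # train_df['Fare'] = pd.qcut(train_df['Fare'], 4, labels=False)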
Example #57
train['Embarked'].fillna('S', inplace=True)

train['Embarked_clean'] = train['Embarked'].astype('category').cat.codes
test['Embarked_clean'] = test['Embarked'].astype('category').cat.codes

###Family
train['Family'] = 1 + train['SibSp'] + train['Parch']
test['Family'] = 1 + test['SibSp'] + test['Parch']

###Solo
train['Solo'] = (train['Family'] == 1)
test['Solo'] = (test['Family'] == 1)

###Fare
train['FareBin'] = pd.qcut(train['Fare'], 5)
test['FareBin'] = pd.qcut(test['Fare'], 5)

#print(train['FareBin'].value_counts())

train['Fare_clean'] = train['FareBin'].astype('category').cat.codes
test['Fare_clean'] = test['FareBin'].astype('category').cat.codes
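
# note: test['Fare'] has one missing value, so its FareBin is NaN and cat.codes
# maps it to -1; a hedged option (not in the original) is to fill it first, e.g.
# test['Fare'] = test['Fare'].fillna(train['Fare'].median())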

#print(train['Fare_clean'].value_counts())

###Title
train['Title'] = train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

train['Title'] = train['Title'].replace([
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
Example #58
def funding_table(list_of_list, df):
    colname = ["campaign_price", "campaign_people", "title"]
    table = pd.DataFrame(columns=colname)
    df1 = pd.DataFrame()
    for i in range(len(list_of_list)):
        df1 = pd.DataFrame(list_of_list[i][1])
        # df1 = df1.drop(["campaign_img", "campaign_content", "funding_price", "total_price", "ratio"], axis=1)
        # assigning a scalar broadcasts the campaign title to every row
        df1["title"] = list_of_list[i][0]
        table = pd.concat([table, df1])
    table = table.drop([
        "campaign_img", "campaign_content", "funding_price", "total_price",
        "ratio"
    ],
                       axis=1)

    table = table.sort_values(by="campaign_price")
    table.index = range(len(table))

    grouping = pd.qcut(table["campaign_price"], 10, labels=False)
    grouped = table["campaign_price"].groupby(grouping)
    test = grouped.apply(get_stats)
    bar = []
    for i in range(10):
        bar.append(str(test[i]["min"]) + "-" + str(test[i]["max"]))

    group = []
    for i in range(len(table)):
        for j in range(10):
            if (table["campaign_price"][i] >=
                    test[j][0]) & (table["campaign_price"][i] <= test[j][1]):
                group.append(j)
    table["group"] = group

    fundraisings = []
    proj_id = df["id"]
    color = [
        "#98d86d", "#61Bf81", "#61bfbf", "#79aad0", "#41709e", "#cda7dd",
        "#a286c7", "#7154c0", "#aa67d1", "#d167b2"
    ]
    for i in range(len(df)):
        print(i)
        proj = proj_id[i]
        name = df["title"][i]
        url = df["url"][i]
        fund = df["funding_target"][i]
        now_fund = df["now_funding"][i]
        fund_ratio = (now_fund / fund) * 100
        round_fund_ratio = round_up(fund_ratio)
        con = table["project"] == proj
        tab1 = table[con]
        tab1 = tab1.drop(["campaign_price", "project", "title", "id"], axis=1)
        tab1 = tab1.groupby(["group"]).sum()
        people = []
        number = tab1.index
        for j in range(10):
            if (j in number):
                people.append(int(tab1["campaign_people"][j]))
            else:
                people.append(0)

        # id_num = [1,2,3,4,5,6,7,8,9,10]

        fund = {
            "id": int(i + 1),
            "color": color[i],
            "name": name,
            "url": url,
            "data": people,
            "proportion": round_fund_ratio
        }
        fundraisings.append(fund)
    data1 = interval(df)
    minnum = data1["mininterval"]
    maxnum = data1["maxinterval"]
    chart = {
        "mininterval": int(minnum),
        "maxinterval": int(maxnum),
        "bar": bar,
        "fundraising": fundraisings
    }

    return chart
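
# get_stats, round_up and interval are not shown in this snippet; judging from how
# the result is indexed above (test[i]["min"], test[j][0]), a plausible get_stats
# could be the sketch below (an assumption, not the original helper)
def get_stats(group):
    # summarise one campaign_price decile by its minimum and maximum price
    return pd.Series({"min": group.min(), "max": group.max()})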
Example #59
df_train[['isInfant', 'Survived']].groupby('isInfant').mean()

# In[ ]:

df_train[['isKid', 'Survived']].groupby('isKid').mean()

# In[ ]:

df_train[['isOld', 'Survived']].groupby('isOld').mean()

# now create the new 'AgeBand' feature

# In[ ]:

for df in df_combine:
    df['tmpt_AgeBand'] = pd.qcut(df['Age'], 4)
df_train.head()
df_train[['tmpt_AgeBand', 'Survived'
          ]].groupby(['tmpt_AgeBand'],
                     as_index=False).mean().sort_values(by='tmpt_AgeBand',
                                                        ascending=True)

# In[ ]:

df_train.drop(labels='tmpt_AgeBand', inplace=True, axis=1)

# In[ ]:

for df in df_combine:
    df['AgeBand'] = 0
    df.loc[df.Age <= 21, 'AgeBand'] = 0
Example #60
def zscore(x, window):
    # assumed setup (not shown in this snippet): r is a trailing rolling window
    # of x and m its shifted rolling mean, matching the call further below
    r = x.rolling(window=window)
    m = r.mean().shift(1)
    s = r.std(ddof=0).shift(1)
    z = (x - m) / s
    # rescale the z-score into the [-1, 1] range
    z_min = np.min(z)
    z_max = np.max(z)
    z = (z - z_min) / (z_max - z_min)
    z = z * 2 - 1
    return z


spy['zscore'] = zscore(spy['delta'], window=36)
spy['zscore'].plot(figsize=figsize)
plt.legend()

# %%
spy = spy.dropna()
print(spy['zscore'].describe())
bottom = np.percentile(spy['zscore'], 20)
high = np.percentile(spy['zscore'], 80)
print(bottom, high)
# %%
plt.figure(figsize=figsize)
worst_days = spy['zscore'] < bottom
spy['price'].plot()
spy.loc[worst_days, 'price'].plot()
plt.show()
# %%
plt.figure(figsize=figsize)
spy['fwd returns'].groupby(pd.qcut(spy['zscore'], 10)).mean().plot(kind='bar')
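
# a small follow-up sketch (an assumption, not in the original): tabulate the same
# zscore deciles before plotting, to sanity-check the bar chart
decile = pd.qcut(spy['zscore'], 10, labels=False, duplicates='drop')
print(spy.groupby(decile)['fwd returns'].agg(['mean', 'count']))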

#%%