Example #1
def zscore(df, index=False, datecolumn='acquisition'):

    import pandas as pd
    import numpy as np

    if index == False:
        df.index = pd.DatetimeIndex(df[datecolumn])
        df = df.drop(datecolumn, axis=1)
    else:
        df.index = pd.DatetimeIndex(df.index)

    # Core of the function: per-day-of-year mean and std
    mean = pd.groupby(df, by=[df.index.dayofyear]).aggregate(np.nanmean)
    std = pd.groupby(df, by=[df.index.dayofyear]).aggregate(np.nanstd)

    df2 = df.copy()
    for y in np.unique(df.index.year):
        for d in np.unique(df.index.dayofyear):
            df2[(df.index.year == y)
                & (df.index.dayofyear == d)] = (df[(df.index.year == y) &
                                                   (df.index.dayofyear == d)] -
                                                mean.ix[d]) / std.ix[d]
            df2.index.name = 'date'

    return df2
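Example #1 leans on the long-deprecated top-level pd.groupby and the removed .ix indexer. A minimal sketch of the same per-day-of-year z-score on current pandas, assuming df carries a DatetimeIndex (the function name below is mine, not from the original):

import pandas as pd

def zscore_modern(df):
    # group every column by day of year and broadcast the group statistics
    # back to the original shape with transform
    grouped = df.groupby(df.index.dayofyear)
    out = (df - grouped.transform("mean")) / grouped.transform("std")
    out.index.name = "date"
    return out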
Example #2
File: Patrimoine.py Project: AnneDy/Til
 def conjoint(self):
     '''
     Compute the partner's id for each individual and check that partners are reciprocal
     '''
     print("working on partners")
     ind = self.ind
     conj = ind.ix[ind['couple']==1,['men','lienpref','id']]
     conj['lienpref'].value_counts()
     conj.ix[conj['lienpref']==1,'lienpref'] = 0
     conj.ix[conj['lienpref']==31,'lienpref'] = 2
     conj.ix[conj['lienpref']==32,'lienpref'] = 3
     conj.ix[conj['lienpref']==50,'lienpref'] = 10
     conj2 = merge(conj, conj, on=['men','lienpref'])
     conj2 = conj2[conj2['id_x'] != conj2['id_y']]
     assert len(conj2) == len(conj)
     conj = conj2
     test = pd.groupby(conj, ['men','lienpref']).size()
     assert max(test)==2 and min(test)==2
     couple = pd.groupby(conj, 'id_x')
     for id, potential in couple:
         if len(potential) == 1:
             conj.loc[ conj['id_x']==id, 'id_y'] = potential['id_y']
         else:
             pdb.set_trace()
     # TODO: no problem here, strange
     conj = conj.rename(columns={'id_x': 'id', 'id_y':'conj'})
     ind = merge(ind,conj[['id','conj']], on='id', how='left')
     
     self.ind = ind
     ## check that partners are reciprocal
     test_conj = merge(ind[['conj','id']],ind[['conj','id']],
                          left_on='id',right_on='conj')
     print "le nombre de couple non réciproque est:", sum(test_conj['id_x'] != test_conj['conj_y'])
     print ("fin du travail sur les conjoints")
Example #3
def get_avg_diff(temp, selected_feature_names):
    """ get the average difference between active / drop-off groups,
        for each active label definition.
        temp = dataframe of features with labels,
        selected_feature_names = list of names of features to be included """

    # grouping by label, for each definition of 'active'
    active_interested_group = pd.groupby(temp, by='isactive_interested')
    active_interested = active_interested_group.get_group(1)
    inactive_interested = active_interested_group.get_group(0)

    active_engaged_group = pd.groupby(temp, by='isactive_engaged')
    active_engaged = active_engaged_group.get_group(1)
    inactive_engaged = active_engaged_group.get_group(0)

    active_subscribed_group = pd.groupby(temp, by='isactive_subscribed')
    active_subscribed = active_subscribed_group.get_group(1)
    inactive_subscribed = active_subscribed_group.get_group(0)

    # extract the difference between group averages for features included in model
    mean_diff_interested = []
    mean_diff_engaged = []
    mean_diff_subscribed = []
    for i in selected_feature_names:
        mean_diff_interested.append(active_interested[i].mean() - inactive_interested[i].mean())
        mean_diff_engaged.append(active_engaged[i].mean() - inactive_engaged[i].mean())
        mean_diff_subscribed.append(active_subscribed[i].mean() - inactive_subscribed[i].mean())

    return mean_diff_interested, mean_diff_engaged, mean_diff_subscribed
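The three near-identical blocks above can be collapsed: grouping once per label and differencing the two group means gives the same per-feature values (a sketch reusing the example's column names):

def mean_diff(temp, label_col, selected_feature_names):
    # means of each selected feature within the active (1) and inactive (0) groups
    means = temp.groupby(label_col)[selected_feature_names].mean()
    # active minus inactive, one entry per feature
    return (means.loc[1] - means.loc[0]).tolist()

# e.g. mean_diff(temp, 'isactive_interested', selected_feature_names)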
def socioeconomic_ratios():
    mgra_b = pd.read_csv("mgra13_based_input2012.csv")
    mgra_sb = pd.read_csv("mgra13_based_input2012_sb.csv")
    households = pd.read_csv("households.csv")
    persons = pd.read_csv("persons.csv")

    # Manipulating the variables of interest
    cs = [1 if x == 2 else 0 for x in persons.PSTUDENT]
    ss = [1 if x == 1 else 0 for x in persons.PSTUDENT]
    emp = [1 if x == 1 or x == 2 else 0 for x in persons.PEMPLOY]

    df1 = pd.merge(pd.groupby(pd.DataFrame({
        "HHID": persons.HHID,
        "college_students": cs,
        "school_students": ss,
        "employed": emp
    }),
                              by="HHID",
                              as_index=False,
                              sort=True,
                              group_keys=True).sum(),
                   households,
                   on="HHID",
                   sort=True)

    df2 = pd.groupby(pd.DataFrame({
        "taz": df1.TAZ,
        "mgra": df1.MGRA,
        "college_students": df1.college_students,
        "school_students": df1.school_students,
        "employed": df1.employed,
        "HWORKERS": df1.HWORKERS
    }),
                     by="mgra" and "taz",
                     as_index=False,
                     sort=True)["college_students", "school_students",
                                "HWORKERS"].sum()

    df3 = pd.DataFrame({
        "mgra":
        mgra_b.mgra,
        "taz":
        mgra_b.TAZ,
        "school_enrollments":
        mgra_b.EnrollGradeKto8 + mgra_b.EnrollGrade9to12,
        "college_enrollments":
        mgra_b.collegeEnroll + mgra_b.otherCollegeEnroll + mgra_b.AdultSchEnrl,
        "emp_total":
        mgra_b.emp_total
    })

    df4 = pd.groupby(df3, by="mgra" and "taz", as_index=False,  # NB: "mgra" and "taz" evaluates to "taz"
                     sort=True)["college_enrollments", "school_enrollments",
                                "emp_total"].sum()

    ############################################################################################################
    a = df4.school_enrollments.sum() / df2.school_students.sum()
    b = df4.college_enrollments.sum() / df2.college_students.sum()
    c = df4.emp_total.sum() / df2.HWORKERS.sum()
Example #5
def generateNumericSummary(dat, group):
    #write your code
    std = pd.groupby(dat, group).std()
    nums_missing = dat.shape[0] - dat.count()
    means = pd.groupby(dat, group).mean()

    dic = {'std': std, 'numMissing': nums_missing, 'mean': means}

    return dic
Example #6
def bayesMean(dt_in, dt_out, t_col = "brand", y_col = "target"):
    mean_dict  = pd.groupby(dt_in[[t_col, y_col]], t_col).mean().to_dict()[y_col]
    ct_dict   = pd.groupby(dt_in[[t_col, y_col]], t_col).count().to_dict()[y_col]
    glbmean  = dt_in[y_col].values.mean()
    def bMeanSngl(vc, vm, glbmean = glbmean, prior = 5):
        return ((vc*vm)+(prior*glbmean))/(vc+prior)
    bmean_dict = dict((kc, bMeanSngl(vc, vm)) for ((kc, vc), (km, vm)) in \
                      zip(ct_dict.iteritems(), mean_dict.iteritems()))
    out = dt_out[t_col].apply(lambda x : bmean_dict.get(x, glbmean)).values
    return out
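The same prior-smoothed mean, (count*mean + prior*global_mean) / (count + prior), can be written without the dict round-trips or the Python-2-only iteritems; a sketch (the function name is mine):

def smoothed_target_mean(dt_in, dt_out, t_col="brand", y_col="target", prior=5):
    glbmean = dt_in[y_col].mean()
    stats = dt_in.groupby(t_col)[y_col].agg(["count", "mean"])
    smoothed = (stats["count"] * stats["mean"] + prior * glbmean) / (stats["count"] + prior)
    # categories unseen in dt_in fall back to the global mean
    return dt_out[t_col].map(smoothed).fillna(glbmean).values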
Example #7
def create_freq_feats(data, column_name):
    '''
    Build a frequency feature for the column, i.e. the mean weekly count for that column
    '''
    freq_feat = column_name + '_freq'
    print('Creating frequency feature: %s' % freq_feat)
    # count of target per (column value, week)
    freq_frame = pd.groupby(data, [column_name, 'Semana'])['target'].count().reset_index()
    freq_frame.rename(columns={'target': freq_feat}, inplace=True)
    # average those counts over weeks
    freq_frame = pd.groupby(freq_frame, [column_name])[freq_feat].mean().reset_index()

    # join the averages back onto the original data
    return pd.merge(data, freq_frame, how='left', on=[column_name], left_index=False,
                    right_index=False, suffixes=('', '_freq'), copy=False)
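The count-then-average steps above can also be chained in one expression on a plain DataFrame groupby (a sketch, keeping the example's Semana and target columns):

def create_freq_feats_v2(data, column_name):
    freq_feat = column_name + '_freq'
    # count target per (column value, week), then average those counts per value
    freq = (data.groupby([column_name, 'Semana'])['target'].count()
                .groupby(level=column_name).mean()
                .rename(freq_feat)
                .reset_index())
    return data.merge(freq, how='left', on=column_name)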
Example #8
def add_value(target_df, data_df, group_col, typ, dropbox_path):
    '''
    Help function for regression_data
    '''
    df_ls = []
    grouped = pd.groupby(target_df, group_col)

    for key, sub_df in grouped:
        if typ == 'swap':
            nation_swap = data_df[data_df.Currency ==
                                  NATION_CURRENCY_DICT[key]]
            df_ls.append(
                sub_df.join(nation_swap[['5Y', 'Butterfly 5y', 'Curve 5y']]))
        elif typ == 'credit':
            credit_df = pd.read_csv(
                dropbox_path + 'cleaned data/Monthly credit spread curves/' +
                CREDIT_DICT[key],
                parse_dates=True,
                infer_datetime_format=True)
            credit_df.Date = pd.to_datetime(credit_df.Date,
                                            infer_datetime_format=True)
            credit_df.set_index('Date', inplace=True)

            df_ls.append(sub_df.join(credit_df['5Y']))
    return pd.concat(df_ls)
Example #9
def campaign_count(request):
    # Chart data is passed to the `dataSource` parameter, as dict, in the form of key-value pairs.
    data_source = dict()
    CHART["caption"] = "Total campaign registrations"
    data_source['chart'] = CHART

    data_source['data'] = []

    my_campaigns = [c for c in Campaign.objects.filter(removed=False)]

    data = pd.DataFrame()
    data['id'] = [c.pk for c in my_campaigns]
    data['created_at'] = [c.created_at for c in my_campaigns]
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))
    data.sort_values(by=['month'], inplace=True)

    gp = pd.groupby(data, by='month').aggregate({'id': 'count'})
    gp = pd.DataFrame(gp)

    for idx, row in gp.iterrows():
        data = dict()
        data['label'] = idx
        data['value'] = str(row['id'])
        data_source['data'].append(data)

    # Create an object for the Column 2D chart using the FusionCharts class constructor
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
Example #10
def plot_deliveries_by_team():
    team_deliveries = ipl_df[['batting_team', 'delivery']]
    ipl_bat_group = pd.groupby(team_deliveries, by='batting_team')
    # print(team_deliveries)
    # print(ipl_bat_group.count().head())
    ipl_bat_group.plot(kind='bar')
    plt.show()
Example #11
    def analysis(self):
        bin_dict = {i: pd.DataFrame() for i in xrange(1, self.num_of_bins+1, 1)}

        grouped = pd.groupby(self.research_data.data, by=[self.research_data.data.date])

        for time_stamp, group in grouped:
            # Filter the input data
            group = group[group[self.alpha.name] != 0]
            group = group.dropna()
            group = group.sort_values(self.alpha.name)

            # Partition daily data into n bins
            partitions = np.array_split(group, self.num_of_bins)
            for i in xrange(1, self.num_of_bins+1, 1):
                bin_dict[i] = bin_dict[i].append(partitions[i-1])

        # Alpha/Return analysis
        for i in xrange(1, self.num_of_bins+1, 1):
            alpha_bin = pd.DataFrame({
                'bin': [i],
                self.alpha.name: [bin_dict[i][self.alpha.name].mean()],
                'return': [bin_dict[i]['return'].mean()*10000]
            })
            self.alpha_return = self.alpha_return.append(alpha_bin[['bin', self.alpha.name, 'return']],
                                                         ignore_index=True)
        self.alpha_return = self.alpha_return.set_index('bin')

        self._plot()
        return self.alpha_return
Example #12
def recommended_candidates(request):
    """
    Recommended
    """

    data_source = dict()
    CHART["caption"] = "Recommended candidates"
    data_source['chart'] = CHART

    columns = ['id', 'created_at']
    data = pd.DataFrame(list(
        Candidate.objects.filter(state__code__in=['GTJ', 'STC'],
                                 removed=False).values_list(*columns)),
                        columns=columns)
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))
    data.drop('created_at', inplace=True, axis=1)

    gp = pd.groupby(data, by='month').aggregate({'id': 'count'})
    data = pd.DataFrame(gp)
    data.sort_index(inplace=True)

    data_source['data'] = []
    for idx, row in data.iterrows():
        data_source['data'].append({'label': idx, 'value': str(row['id'])})

    # Create an object for the Column 2D chart using the FusionCharts class constructor
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
Example #13
def get_unique_users_registrations(request):
    """
    Unique users registered per month
    """

    data_source = dict()
    CHART["caption"] = "Unique user registrations"
    data_source['chart'] = CHART

    columns = ['id', 'created_at']
    data = pd.DataFrame(list(User.objects.all().values_list(*columns)),
                        columns=columns)
    data['month'] = data['created_at'].apply(lambda date: '{y}-{m}'.format(
        y=date.year, m=get_month_format(date.month)))
    data.drop('created_at', inplace=True, axis=1)

    gp = pd.groupby(data, by='month').aggregate({'id': 'count'})
    data = pd.DataFrame(gp)
    data.sort_index(inplace=True)

    data_source['data'] = []
    for idx, row in data.iterrows():
        data_source['data'].append({'label': idx, 'value': str(row['id'])})

    # Create an object for the Column 2D chart using the FusionCharts class constructor
    column_2d = FusionCharts("column2D", "ex1", "600", "350", "chart-1",
                             "json", data_source)
    return render(request, cts.STATS_INDEX, {'output': column_2d.render()})
def generate_conv_timestamp():
    #add timestamp of conversion for each user
    path = r'C:\Users\sesig\Documents\master data science\tfm\r_dataset_cleaned\data_all_1u.csv'
    data = pd.read_csv(filepath_or_buffer=path, sep=',')
    data_grouped = pd.groupby(data, by='uid')
    nuser = pd.Series.nunique(data['uid'])
    x = pd.DataFrame(
        data={
            'uid': np.arange(nuser, dtype=np.int_),
            'tconv': np.zeros(nuser, dtype=np.float_)
        })

    path_params = r'C:\Users\sesig\Documents\master data science\tfm\criteo_cleaned_data\gamma_dist_params.csv'
    channel_params = pd.read_csv(filepath_or_buffer=path_params, sep=',')

    i = 0
    for name, group in data_grouped:
        if group.iloc[0, 2] == 1:
            ch = group.iloc[-1, 1]
            a = channel_params.loc[ch, 'shape parameter']
            loc = channel_params.loc[ch, 'location parameter']
            scale = channel_params.loc[ch, 'scale parameter']
            x.loc[i, 'tconv'] = group.iloc[-1, 3] + stats.gamma.rvs(
                a, loc=loc, scale=scale, size=1, random_state=i)
            i += 1
        else:
            x.loc[i, 'tconv'] = group.iloc[-1, 3] + 15
            i += 1

    path_out = r'C:\Users\sesig\Documents\master data science\tfm\r_dataset_cleaned\r_dataset_tconv.csv'
    pd.DataFrame.to_csv(x, path_or_buf=path_out, sep=',', index=False)
Example #15
def get_number_of_unique_users():
    """
    select date_trunc('month', created_at) m,
      count(distinct user_id) unique_users
    from candidates
    where state_id!=11
    and not removed
    group by m
    order by m;
    """

    first_candidate_columns = ['user_id', 'user__created_at']
    data = pd.DataFrame(list(
        Candidate.objects.filter(
            ~Q(state__in=get_prospect_states()),
            removed=False).values_list(*first_candidate_columns)),
                        columns=first_candidate_columns)

    data['month'] = data['user__created_at'].apply(
        lambda date: '{y}-{m}'.format(y=date.year,
                                      m=get_month_format(date.month)))
    data.drop('user__created_at', inplace=True, axis=1)

    gp = pd.groupby(data, by='month').aggregate({'user_id': pd.Series.nunique})
    data = pd.DataFrame(gp)

    return data
Example #16
def get_reinspection_current_count(bbh):
    """Counts the number of hangers inside the reinspection
    at every timestep from a BitBusHist dataframe.

    Returns a three-tuple:
        counter: Series, index like bbh, values are number of carcasses
            in reinspection at the given time.
        irregulars: list of uids that do not conform to expectations
        leftovers: list of uids that are not registered as leaving
            the reinspection
    """

    # Curious note to self:
    # It seems that np.in1d is about 3 times faster than pd.ser.isin

    bbh = bbh.sort_index().reset_index()


    inside = set()
    irregulars = []

    bbh['movements'] = np.in1d(bbh.Tx.values, REINSPECTION_TX_IN).astype(int)\
                     - np.in1d(bbh.Tx.values, REINSPECTION_TX_OUT).astype(int)

    n_uids = len(bbh.uids.unique())
    inspect = []
    for uid, vals in IProgressBar(bbh.groupby('uids'), n_uids):
        s = vals.movements.sum()
        if s != 0:
            inspect.append(uid)

    # Use multiprocessing.Pool.map here to examine the uids
    # in `inspect`



    leaving = bbh.Tx.isin(REINSPECTION_TX_OUT)
    moves = bbh.Tx.isin(REINSPECTION_TX_IN) - leaving
    defleft = bbh.Tx.isin(REINSPECTION_TX_DEFINITELY_OUT)


    for (uid, leave), (uid, left) in \
        zip(pd.groupby(leaving, by=bbh.uids),
            pd.groupby(defleft, by=bbh.uids)):
        pass # TODO

    counter = (entering - leaving).cumsum()
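The function above never finishes assembling the counter its docstring promises (entering is undefined when the final line runs). Under the example's own conventions, the count of hangers inside at each timestep would be the cumulative sum of the +1/-1 movements column, roughly:

# sketch: +1 on a reinspection entry, -1 on an exit; the running sum is
# the number of carcasses inside at each row of bbh
movements = np.in1d(bbh.Tx.values, REINSPECTION_TX_IN).astype(int) \
          - np.in1d(bbh.Tx.values, REINSPECTION_TX_OUT).astype(int)
counter = pd.Series(movements, index=bbh.index).cumsum()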
def group_data( table ):
	groups = []
	for key, group in pd.groupby( table, lambda x: x[1] ):
		total = 0
		for item in group:
			total += int( item[2] )

		groups.append( (group[0][1], total) )
 def plot_yrly_result(self, **kwargs):
     df = self.data[['result'] + self.benchmarks]
     df = df + 1
     df = pd.groupby(df, by=[df.index.year]).prod()
     df = df - 1
     df = df * 100
     df.rename(columns={'result': self.name}, inplace=True)
     df.plot.bar(legend=True, **kwargs)
Example #19
File: ts.py Project: ecustzhy/test
def groupby_year_month(df):
    """ Groups a pandas `DataFrame` by year and month.

    :param df: A pandas `DataFrame`.

    :returns: The grouped `DataFrame`.
    """
    return pd.groupby(df, by=[df.index.year, df.index.month])
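A small usage sketch (on the older pandas for which the top-level pd.groupby above still exists): monthly means are then one aggregation away.

df = pd.DataFrame({'value': range(6)},
                  index=pd.date_range('2016-01-15', periods=6, freq='20D'))
monthly_means = groupby_year_month(df).mean()   # MultiIndex rows: (year, month)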
def plotEQCountByMonth(df):
    pdg = pd.groupby(df, by=[df.index.month, df.index.year])
    plot = pdg.count()[['code'
                        ]].plot(kind='bar',
                                legend=False,
                                title="Count of Earthquakes by Month in 2016")
    plot.set(xlabel="Months", ylabel="No. of EarthQuakes")
    plt.show()
Example #21
def gen_normalize_by_month_1(stock,pct,month):
    pct=pct.ix[pct.index.month==month]
    dataGroup = pd.groupby(pct,by=[pct.index.month,pct.index.year])
    for key in dataGroup.groups :
        _month = dataGroup.get_group(key)
        _month = (_month + 1).cumprod()
        _month['normalized']=(_month[stock]-_month['spx']) + 1
        yield key, _month, _month['normalized']
Example #22
File: Patrimoine.py Project: AnneDy/Til
    def enfants(self):   
        '''
        Compute the parents' ids
        '''
        ind = self.ind
        enf = ind.ix[ ind['enf'] != 0 ,['men','lienpref','id','enf']]
        enf0 = enf[enf['enf'].isin([1,2])]
        enf0['lienpref'] = 0
        enf0 = merge(enf0, ind[['men','lienpref','id']], on=['men','lienpref'], how='left', suffixes=('', '_1'))
        
        enf1 = enf[enf['enf'].isin([1,3])]
        enf1['lienpref'] = 1
        enf1 = merge(enf1, ind[['men','lienpref','id']], on=['men','lienpref'], how='left', suffixes=('', '_2'))
              
        # grandchildren case: look for the children of the reference person (enf=1, 2 or 3)
        # and try to match them to the grandchildren (lienpref=31).
        # Strictly speaking, a link should be kept when the parents are not found, to send it to the registry...
        # and to record that these are grandchildren (for inheritance, for instance)
        par4 = enf[enf['enf'].isin([1,2,3])]
        par4['lienpref'] = 21
        par4 = merge(par4, ind[['men','lienpref','id']], on=['men','lienpref'], how='inner', suffixes=('_4', ''))
        enf4 = DataFrame( index=par4['id'].unique(), columns=['id_1','id_2'], dtype=np.int32)
        parents = pd.groupby(par4, 'id')
        for idx, parent in parents:
            id = int(idx)
            if len(parent) == 1:
                enf4['id_1'][id] = int(parent['id_4'])
            else:
                # cases to resolve "by hand"
                potential = ind.loc[parent['id_4'], ['anais','lienpref','sexe','couple','conj']]
                potential = potential[ind.loc[id,'anais'] - potential['anais'] > 16 ]
                pot_mother = potential[potential['sexe'] ==2 ]
                if len(pot_mother):
                    par =  pot_mother['anais'].idxmin()
                else: 
                    par =  potential['anais'].idxmin()
                enf4['id_1'][id] = par
        
        enf4['id'] = enf4.index
        enf4['id_2'] = ind.ix[enf4['id_1'],'conj'].values

        enf = merge(enf0[['id','id_1']],enf1[['id','id_2']], how='outer')
        enf = enf.append(enf4[['id','id_1','id_2']])       
        enf = merge(enf,ind[['id','sexe']], left_on='id_1', right_on='id', how = 'left', suffixes=('', '_'))
        del enf['id_']
    
        enf['pere'] = Series(dtype=np.int32)
        enf['pere'][enf['sexe']==1] = enf['id_1'][enf['sexe']==1] 
        enf['mere'] = Series(dtype=np.int32)
        enf['mere'][enf['sexe']==2] = enf['id_1'][enf['sexe']==2] 
        
        cond_pere = notnull(enf['mere']) & notnull(enf['id_2'])
        enf['pere'][cond_pere] = enf['id_2'][cond_pere]
        cond_mere = ~notnull(enf['mere']) & notnull(enf['id_2'])
        enf['mere'][cond_mere] = enf['id_2'][cond_mere]
        # sum(sexe1==sexe2): 6 same-sex parent couples
        ind = merge(ind,enf[['id','pere','mere']], on='id', how='left')
        self.ind = ind
Example #23
def cat_report():
    '''
    prints longitudinal expenditure per category
    '''
    c = consolidate('/statements')
    d = c[c['Transaction_Ref2'] != 'LeeEJ'].set_index('Transaction_Date')
    e = pd.groupby(d, by=['Transaction_Ref1', d.index.year,
                          d.index.month]).sum()
    return e
Example #24
def function_over_events(function, dataframe, branch_selection=None, **kwargs):
    """Generator which yields `function(event, **kwargs)` of each processed data event in dataframe
    """
    for run_number, events in pd.groupby(dataframe, 'run_number'):
        yield from function_results_datasets(run_number,
                                             function,
                                             events.event_number.values,
                                             branch_selection=branch_selection,
                                             kwargs=kwargs)
def groupby_reset(col):
    colname = "'%s'" % col
    df = (
        pd.groupby([colname]).sum()  # NB: pd.groupby is missing its DataFrame argument here
        .sort_values('Global_Sales', ascending = False)
        .reset_index(col_level = 1)
    )
    
    return df
Example #26
def make_propspertext(distrawcounts, label):
    distrawcounts = distrawcounts.T
    segids = list(distrawcounts.index)
    distrawcounts["idnos"] = [item[0:6] for item in segids]
    # print(distrawcounts.head())
    rawcountspertext = pd.groupby(distrawcounts, "idnos")
    distpropspertext = rawcountspertext.aggregate(np.mean)
    distpropspertext["label"] = label
    # print(distpropspertext.head())
    return distpropspertext
Example #27
def create_hist_data(evts, limit, max_top=10):
    nevts = evts.iloc[0:limit]
    tmp = pd.groupby(nevts, by=[nevts.index.month]).count()
    tmp['real_top'] = max_top * tmp.sdt/max(tmp.sdt)
    res = []

    for i in range(2, 7):
        res.append(tmp['real_top'].get(i, 0))

    return res
Example #28
def create_hist_data(evts, limit, max_top=10):
    nevts = evts.iloc[0:limit]
    tmp = pd.groupby(nevts, by=[nevts.index.month]).count()
    tmp['real_top'] = max_top * tmp.sdt / max(tmp.sdt)
    res = []

    for i in range(2, 7):
        res.append(tmp['real_top'].get(i, 0))

    return res
Example #29
    def enfants(self):   
        '''
        Compute the parents' ids
        '''
        ind = self.ind
        print("travail sur les enfants")
        enf = ind.ix[ ind['enf'] != 0 ,['men','lienpref','id','enf']]
        enf0 = enf[enf['enf'].isin([1,2])]
        enf0['lienpref'] = 0
        enf0 = merge(enf0, ind[['men','lienpref','id']], on=['men','lienpref'], how='left', suffixes=('', '_1'))
        
        enf1 = enf[enf['enf'].isin([1,3])]
        enf1['lienpref'] = 1
        enf1 = merge(enf1, ind[['men','lienpref','id']], on=['men','lienpref'], how='left', suffixes=('', '_2'))
        
        # for grandchildren we reverse the relation: select the children
        # who will act as parents for the grandchildren
        print("grandchildren case")
        enf4 = enf[enf['enf'].isin([1,2,3])]
        enf4['lienpref'] = 21
        enf4 = merge(enf4, ind[['men','lienpref','id']], on=['men','lienpref'], how='inner', suffixes=('_4', ''))
        enf4['id_1'] = Series()
        enf4['id_2'] = Series()
        parents = pd.groupby(enf4, 'id')
        for id, parent in parents:
            if len(parent) == 1:
                enf4.loc[ enf4['id']==id, 'id_1'] = parent['id_4']
            elif len(parent) == 2:
                enf4.loc[ enf4['id']==id, 'id_1'] = parent['id_4'].values[0]
                enf4.loc[ enf4['id']==id, 'id_2'] = parent['id_4'].values[1]
            else:
                # cases to resolve
        #         print(ind.ix[ind['men']==parent['men'].values[0],['age','lienpref']])
                enf4.ix[ enf4['id']==id, 'id_1'] = 15043

        enf = merge(enf0[['id','id_1']],enf1[['id','id_2']], how='outer')
        enf = enf.append(enf4[['id','id_1','id_2']])
        
        enf = merge(enf,ind[['id','sexe']], left_on='id_1', right_on='id', how = 'left', suffixes=('', '_'))
        del enf['id_']
        
        enf['pere'] = Series()
        enf['pere'][enf['sexe']==1] = enf['id_1'][enf['sexe']==1] 
        enf['mere'] = Series()
        enf['mere'][enf['sexe']==2] = enf['id_1'][enf['sexe']==2] 
        
        cond_pere = notnull(enf['mere']) & notnull(enf['id_2'])
        enf['pere'][cond_pere] = enf['id_2'][cond_pere]
        cond_mere = ~notnull(enf['mere']) & notnull(enf['id_2'])
        enf['mere'][cond_mere] = enf['id_2'][cond_mere]
        # sum(sexe1==sexe2): 6 same-sex parent couples
        ind = merge(ind,enf[['id','pere','mere']], on='id', how='left')
        print("fin du travail sur les enfants")
        self.ind = ind
Example #30
def monthly_report():
    c = consolidate('/statements')
    d = c[c['Transaction_Ref2'] != 'LeeEJ'].set_index('Transaction_Date')
    e = pd.groupby(d, by=[d.index.year, d.index.month])
    savings = e['Credit_Amount'].sum() - e['Debit_Amount'].sum()
    print e.sum()
    print e.sum().sum()
    print savings
    print savings.sum()
    # savings.plot(style='o')
    # plt.show()
    return e
Example #31
File: nb.py Project: nahmiasd/KaggleWNV
def aggregate_num_mosquitos(train,test):
        num_by_trap = pd.groupby(train[['Trap', 'NumMosquitos', 'WnvPresent']], 'Trap').agg('sum')
        num_by_trap['trap_percent_of_all_mosquitos'] = num_by_trap['NumMosquitos']/sum(num_by_trap.NumMosquitos)
        num_by_trap['trap_percent_with_wnv'] = num_by_trap.WnvPresent/num_by_trap.NumMosquitos
        num_by_trap.reset_index(inplace=True)
        map_mosq_weight = {t:v for t, v in zip(num_by_trap.Trap.values, num_by_trap['trap_percent_of_all_mosquitos'].values)}
        map_wnv_weight = {t:v for t, v in zip(num_by_trap.Trap.values, num_by_trap['trap_percent_with_wnv'].values)}
        train['trap_mosq_rate'] = train.Trap.map(map_mosq_weight)
        train['trap_wnv_rate'] = train.Trap.map(map_wnv_weight)
        test['trap_mosq_rate'] = test.Trap.map(map_mosq_weight).fillna(0)
        test['trap_wnv_rate'] = test.Trap.map(map_wnv_weight).fillna(0)
        return train,test
Example #32
def target_encoding(df, columns, target, new_column = False):
    """
    Encodes a categorical feature as its target mean
    """
    for column in columns:
        
        group = pd.groupby(df[[column, target]], column).mean()
        new_column_name = column
        if new_column:
            new_column_name = column + "_target_encoding"
        df[new_column_name] = df[column].apply(lambda x : group[group.index == x][target].values[0])
    return df
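A vectorized variant of the same encoding maps the per-category means instead of applying a lookup row by row (a sketch, not the original project's code):

def target_encoding_v2(df, columns, target, new_column=False):
    for column in columns:
        means = df.groupby(column)[target].mean()
        name = column + "_target_encoding" if new_column else column
        df[name] = df[column].map(means)
    return df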
Example #33
def create_hist(evts, max_top=10):
    main_grpd = pd.groupby(evts, by=[evts.index.month]).count()
    main_grpd['i'] = main_grpd.index
    main_grpd['left'] = main_grpd.apply(lambda x: dt.datetime(2014, x.i, 1),
                                        axis=1)
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'][main_grpd.index[0]],
        periods=len(main_grpd),
        freq='M')
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd.sdt / max(main_grpd.sdt)
    return main_grpd
def parallelize_on_flid_save_to_db(df_in, func):
    """group the dataframe by icao address
    and process these using parallelization"""

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    print("Number of unique icaos: %d", len(df_in['flight_id'].unique()))

    res = pool.map(
        func, [group for name, group in pd.groupby(df_in, by=['flight_id'])])
    pool.close()
    pool.join()

    return True
Example #35
def create_hist(evts, max_top=10):
    main_grpd = pd.groupby(evts, by=[evts.index.month]).count()
    main_grpd['i'] = main_grpd.index
    main_grpd['left'] = main_grpd.apply(lambda x: dt.datetime(2014, x.i, 1), axis=1)
    main_grpd['right'] = pd.date_range(
        start=main_grpd['left'][main_grpd.index[0]],
        periods=len(main_grpd),
        freq='M'
    )
    main_grpd['top'] = np.zeros(len(main_grpd.index))
    main_grpd['bottom'] = np.zeros(len(main_grpd.index))
    main_grpd['real_top'] = max_top * main_grpd.sdt/max(main_grpd.sdt)
    return main_grpd
Example #36
def create_hist_layers(evts, max_top):
    """
    Takes a dataframe of events (where index contains the datetime of each evt
    and return new groupby.count() object with the following columns to build
    an histogram:
        left, right, bottom, top, real_top
    """
    try:
        main_grpd = pd.groupby(evts, by=[evts.index.month]).count()
        main_grpd['i'] = main_grpd.index
        main_grpd['left'] = main_grpd.apply(lambda x: dt.datetime(2014, x.i, 1), axis=1)
        main_grpd['right'] = pd.date_range(
            start=main_grpd['left'][main_grpd.index[0]],
            periods=len(main_grpd),
            freq='M'
        )
        main_grpd['top'] = np.zeros(len(main_grpd.index))
        main_grpd['bottom'] = np.zeros(len(main_grpd.index))
        main_grpd['real_top'] = max_top * main_grpd.sdt/max(main_grpd.sdt)

    except AttributeError:
        # evts is not a DataFrame, most likely case there are no evts
        pass

    for i in range(1, len(evts)+1):
        nevts = evts.iloc[0:i]

        # build a temp dh to compute this layer histogram counts
        tmp= pd.groupby(nevts, by=[nevts.index.month]).count()
        tmp['real_top'] = tmp.sdt/max(tmp.sdt)

        # copy the original df so we have all basic config already set
        grpd = main_grpd.copy()

        for ind in grpd.index:
            grpd.real_top[ind] = tmp.real_top.get(ind, 0)

        yield grpd
Example #37
def create_hist_layers(evts, max_top):
    """
    Takes a dataframe of events (where index contains the datetime of each evt
    and return new groupby.count() object with the following columns to build
    an histogram:
        left, right, bottom, top, real_top
    """
    try:
        main_grpd = pd.groupby(evts, by=[evts.index.month]).count()
        main_grpd['i'] = main_grpd.index
        main_grpd['left'] = main_grpd.apply(
            lambda x: dt.datetime(2014, x.i, 1), axis=1)
        main_grpd['right'] = pd.date_range(
            start=main_grpd['left'][main_grpd.index[0]],
            periods=len(main_grpd),
            freq='M')
        main_grpd['top'] = np.zeros(len(main_grpd.index))
        main_grpd['bottom'] = np.zeros(len(main_grpd.index))
        main_grpd['real_top'] = max_top * main_grpd.sdt / max(main_grpd.sdt)

    except AttributeError:
        # evts is not a DataFrame, most likely case there are no evts
        pass

    for i in range(1, len(evts) + 1):
        nevts = evts.iloc[0:i]

        # build a temp dh to compute this layer histogram counts
        tmp = pd.groupby(nevts, by=[nevts.index.month]).count()
        tmp['real_top'] = tmp.sdt / max(tmp.sdt)

        # copy the original df so we have all basic config already set
        grpd = main_grpd.copy()

        for ind in grpd.index:
            grpd.real_top[ind] = tmp.real_top.get(ind, 0)

        yield grpd
Example #38
def calib_func(x,data,buckets,option,minfactor,maxfactor):
    if(option == 'no'):
        minfactor = np.percentile(data['factor1'].values,2)
        maxfactor = np.percentile(data['factor1'].values,98)
        
    #Limit movement outside model bounds 
    if(option == 'yes'):
        data['factor1'][data['factor1']>=maxfactor] = maxfactor
        data['factor1'][data['factor1']<=minfactor] = minfactor

    #The second layer of this simplified neural network controls the trend load
    factorload2 = 1.0/(1.0+np.exp(-x[3]*data['range']/data['duration']))
    #Scale this layer without allowing fixed movement
    factorload = x[2]*factorload2 - x[2]/2.0

    #Layer 1 - Main classification factor of neural network
    z = 1.0/(1.0+np.exp(-factorload * x[1] * data['factor1']))
    
    #Sigmoid function is used for classification
    adjustment_series = pd.Series(z)
    #Scale the model
    adjustment_series = x[0]*adjustment_series - x[0]/2
    newprice = data['bs_probability'] + adjustment_series.values

    data['newprice']=newprice
    data['newpnl']=0.0
    data['newpnl'] = data['wins']-data['newprice']
    if(option=='yes'):
        data['factorload']=x[2]*factorload2 - x[2]/2.0
        data['factorload2']=factorload2
        return 1.0
    d1=np.repeat(0.0,7)
    i=0
    ranges = pd.groupby(data,pd.qcut(data['range'],7))
    for mname,m_data in ranges:
        d1[i]=np.mean(pd.groupby(m_data,by=pd.qcut(m_data['factor1'],7)).mean()['newpnl']**2)
        i=i+1
    return np.mean(d1)**0.5
Example #39
def extend(data, treemakers):
    """Extends the dataframe data by loading treemakers for the remaining events
    See https://github.com/XENON1T/hax/pull/52 for more information.

    :param data: dataframe, assumed to be event-per-row

    :param treemakers: list of treemakers to load
    """
    new_minitrees = []
    for run_number, events in pd.groupby(data, 'run_number'):
        new_minitrees.append(load_single_dataset(run_number, treemakers, event_list=events.event_number.values)[0])
    result = _merge_minitrees(data, pd.concat(new_minitrees))
    result.cut_history = data.cut_history
    return result
Example #40
 def fit(self, df, y=None):
     N = df.shape[0]
     min_freq = max(self.min_freq, np.ceil(N * self.min_freq_ratio))
     self.mappers_ = {}
     self.features_ = list(df.columns)
     for col in self.features_:
         self.mappers_[col] = {}
         group_by = pd.groupby(y, by=df[col])
         group_by_count = group_by.count()
         active_groups = group_by_count[group_by_count >= min_freq].index
         if self.report_freq_ratio:
             self.mappers_[col]['freq_ratio'] = (group_by_count / N)[active_groups].to_dict()
         for agg in self.aggregators:
             self.mappers_[col][agg] = group_by.agg(agg)[active_groups].to_dict()
     return self
Example #41
def process_delta(delta, name):
    # type: (pd.Series, str) -> pd.Series
    """Take a delta of a column
    """
    delta_per_month_gb = pd.groupby(
        delta, by=[delta.index.year, delta.index.month]
    )
    # Drop the first month
    # Shifts turn dtype into floating point, return to int
    delta_per_month = delta_per_month_gb.sum().iloc[1:]
    delta_per_month.index.name = "Month"
    delta_per_month.index = delta_per_month.index.to_series().apply(
        pretty_month)
    delta_per_month.name = name
    return delta_per_month
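Because delta carries a DatetimeIndex, the same monthly totals can also come from resample; a sketch (it labels months with strftime rather than the example's pretty_month helper):

def process_delta_v2(delta, name):
    monthly = delta.resample("MS").sum().iloc[1:]   # monthly sums, first month dropped
    monthly.index = monthly.index.strftime("%Y-%m")
    monthly.index.name = "Month"
    monthly.name = name
    return monthly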
Example #42
def parallelize_on_icao(df_in, func):
    """group the dataframe by icao address and process \
    these using parallelization"""

    pool_cpu_size = cpu_count
    pool = multiprocessing.Pool(pool_cpu_size)
    print("Number of unique icaos: %d", len(df_in['icao'].unique()))

    df_processed = pd.concat(
        pool.map(func,
                 [group for name, group in pd.groupby(df_in, by=['icao'])]))
    pool.close()
    pool.join()

    return df_processed
Example #43
def add_value(target_df, data_df,  group_col, typ):
    df_ls = []
    grouped = pd.groupby(target_df, group_col)
	
    for key, sub_df in grouped:
        if typ == 'swap':
            nation_swap = data_df[data_df.Currency == NATION_CURRENCY_DICT[key]]
            df_ls.append(sub_df.join(nation_swap[['10Y', 'Butterfly 10y', 'Curve 10y']]))
        elif typ == 'credit':
            credit_df = pd.read_csv(ROOT_DIR + 'cleaned data/Monthly credit spread curves/' + CREDIT_DICT[NATION_CURRENCY_DICT[key]],parse_dates = True, infer_datetime_format=True )
            credit_df.Date = pd.to_datetime(credit_df.Date, infer_datetime_format = True)
            credit_df.set_index('Date', inplace = True)
                
            df_ls.append(sub_df.join(credit_df['10Y']))
    return pd.concat(df_ls)
def equity_resharp(newtotalpo):
	deltatime=offsets.DateOffset(hours=6)
	newtotalpo['stockdate']=newtotalpo['stockdate']+deltatime
	newtotalpo['day']=newtotalpo['stockdate'].apply(lambda x: x.strftime('%Y%m%d'))
	mydailposition=newtotalpo[newtotalpo['stockdate'].apply(lambda x: x.strftime('%H%M%S'))=='205900'][['stockdate','totalposition']]
	mydailposition['day']=mydailposition['stockdate'].apply(lambda x: x.strftime('%Y%m%d'))
	mydailposition['lastdayposition']=mydailposition['totalposition']
	mydailposition=mydailposition[['day','lastdayposition']]
	newtotalpo['deltaposition']=abs(newtotalpo['totalposition']-newtotalpo['totalposition'].shift())
	day_equity=pd.groupby(newtotalpo,'day').sum()
	day_equity['day']=day_equity.index
	day_equity=pd.merge(day_equity,mydailposition,how='left',on='day')
	day_equity['lastdayposition']=day_equity['lastdayposition'].fillna(method='ffill')


	return day_equity[['profit','comm','equity','deltaposition','lastdayposition','day']]
Example #45
	def run(self):
		infiles = self.Config.append_dir("MakeDrugPairsIn")
		outfiles = self.Config.append_dir("MakeDrugPairsOut")

		for (infile,outfile) in zip(infiles,outfiles):
			data = self.loadDF(infile)
			if not data:
				continue
			
			data['ChemID'] = data[self.Config.keys['bnf']].map(lambda x: x[0:9])
			data = util.sumBy(data,[self.Config.keys['practice'],'ChemID',self.Config.keys['gen'],'postal code'])
			
			grouped = pandas.groupby(data,self.Config.keys['gen'])
			data = pandas.merge(grouped.get_group(1.0),grouped.get_group(0.0),
				on =[self.Config.keys['practice'],'ChemID'], 
				left_index=False, 
				right_index = False,
				how = 'outer',
				sort = False,
				suffixes = ('_gen','_brand'))

			for col in data.columns.values.tolist():
				data[col] = data[col].map(lambda x: 0 if x!=x else x)

			data['postal code'] = data.apply(
				lambda row: row['postal code_gen'] 
				if row['postal code_brand']!=row['postal code_brand'] 
				else row['postal code_brand'],
				axis=1)

			items = self.Config.keys['items']
			quan = self.Config.keys['quantity']
			nic = self.Config.keys['nic']
			cols = [items,quan,nic]

			for col in cols:
				data['sum'+col] = data[col+'_brand']+data[col+'_gen']
				data['percent'+col]= data[col+'_brand']/data['sum'+col]

			data = data.drop(['INCLUDE_gen','INCLUDE_brand',
				'GENERIC_gen','GENERIC_brand',
				'postal code_gen','postal code_brand'], 
				axis=1)

			data.to_csv(outfile, index = False)
def equity_resharp_huibao(newtotalpo):
	deltatime=offsets.DateOffset(hours=6)
	newtotalpo['stockdate']=newtotalpo['stockdate']+deltatime
	newtotalpo['day']=newtotalpo['stockdate'].apply(lambda x: x.strftime('%Y%m%d'))
	day_equity=pd.groupby(newtotalpo,'day').sum()
	return day_equity[['profit','comm','equity']]




	pass






	##############################
	pass
def get_benchmark_from_db(type):
	db = MySQLdb.connect("192.168.51.100","PASS_DEV","Develop-2015","PASS_SYS")
	if type == 'Top':
		symbol = "PERF-GLOBAL"
	elif type == 'Equity': 
		symbol = "SPX INDEX"

	query = "select HD_PK from PASS_SYS.V_SERIE where ST_SECURITY_CODE='%s'"  % (symbol)
	select_0 = pd.read_sql(query, db, coerce_float = False)
	query1 = "select DT_DATE, NU_PX_LAST from PASS_SYS.V_MKTDATA where LK_SERIE = unhex('%s')" % (select_0.values[0][0].encode('hex'))
	bench = pd.read_sql(query1, db, index_col = 'DT_DATE')
	# calculation of benchmark standard deviation by month 
	bench_sharp = bench 
	bench_sharp['Returns'] = bench.pct_change()
	bench_sharp = bench_sharp['Returns']
	bench_st_deviation = pd.groupby(bench_sharp, by=[bench_sharp.index.year,bench_sharp.index.month]).apply(lambda rets: np.std(rets))
	# calculation of benchmark return by month 
	bench = bench.resample("M").ffill()
	bench['Returns'] = bench.pct_change()
	bench = to_multindex(bench)
	bench_returns = bench['Returns'].dropna() 
	bench_sharpe_ratio = bench_returns.to_frame(name = 'Returns') / bench_st_deviation.to_frame(name = 'Returns')
	return bench_returns, bench_sharpe_ratio
Example #48
def combine_columns(dfOrig, codeName = 'SEO.Code.', prcName = 'SEO.Percentage.', codeRange = 5, index = 'Grant.Application.ID'):
    """ Goes through all codeNames + codeRange, , impute '99' to blanks, get_dummies on them, drops colums with code 0 and
    add up each throughout the range."""
    df = dfOrig.copy()
    cleanDf = df[['{}{}'.format(codeName, i) for i in range(1, codeRange+1)]].fillna(990000) // 10000
    cleanDf[index] = df[index]
    dummyDf = []
    for i in range(1, codeRange+1):
        cleanDf['{}{}'.format(prcName, i)] = df['{}{}'.format(prcName, i)]
        currDummy = cleanDf[[index] + ['{}{}'.format(codeName, i)]]
        currDummy = pd.get_dummies(currDummy['{}{}'.format(codeName, i)], prefix = codeName)
        currDummy[index] = cleanDf[index]
        currDummy['{}{}'.format(prcName, i)] = cleanDf['{}{}'.format(prcName, i)]
        currDummy = pd.groupby(currDummy, index)[currDummy.columns].max()
        currDummy2 = currDummy.apply(lambda x: x[:-2] * x[-1], axis = 1)
        currDummy2[index] = currDummy[index]
        dummyDf.append(currDummy2)
    currDummy = dummyDf[0]
    for i in range(1, codeRange):
        currDummy = currDummy.add(dummyDf[i], fill_value = 0.)
        currDummy[index] = dummyDf[i][index]
        currDummy.fillna(0, inplace=True)
    return currDummy

print 'Loaded load'

winddf = winddf[winddf.index < pd.Timestamp('2015-01-01 00:00:00')]
solardf = solardf[solardf.index < pd.Timestamp('2015-01-01 00:00:00')]
loaddf = loaddf[loaddf.index < pd.Timestamp('2015-01-01 00:00:00')]

winddf.index.name = 'Time'
solardf.index.name = 'Time'
loaddf.index.name = 'Time'

winddf.to_csv(outdir + 'wind_signal_ECMWF.csv', float_format='%.4f')
solardf.to_csv(outdir + 'solar_signal_ECMWF.csv', float_format='%.4f')
loaddf.to_csv(outdir + 'load_signal.csv', float_format='%.4f')

raise SystemExit

# # Interesting plots

# Mean production for each hour of the day, relative to yearly mean.
# Increase of ~50% during midday.
(pd.groupby(df, by=df.index.hour).mean()/df.mean(axis=0)).plot(c='k', alpha=0.1)


w = winddf.mean(axis=1)/winddf.mean().mean()
s = solardf.mean(axis=1)/solardf.mean().mean()
l = loaddf.mean(axis=1)/loaddf.mean().mean()

df = pd.DataFrame(data=np.array([a*w-(1-a)*s-l for a in alphas]).T, columns=alphas, index=w.index)
Example #50
def _box_reshape(vals, groupby, names, order):
    """Reshape the box/violinplot input options and find plot labels."""

    # Set up default label outputs
    xlabel, ylabel = None, None

    # If order is provided, make sure it was used correctly
    if order is not None:
        # Assure that order is the same length as names, if provided
        if names is not None:
            if len(order) != len(names):
                raise ValueError("`order` must have same length as `names`")
        # Assure that order is only used with the right inputs
        is_pd = isinstance(vals, pd.Series) or isinstance(vals, pd.DataFrame)
        if not is_pd:
            raise ValueError("`vals` must be a Pandas object to use `order`.")

    # Handle case where data is a wide DataFrame
    if isinstance(vals, pd.DataFrame):
        if order is not None:
            vals = vals[order]
        if names is None:
            names = vals.columns.tolist()
        if vals.columns.name is not None:
            xlabel = vals.columns.name
        vals = vals.values.T

    # Handle case where data is a long Series and there is a grouping object
    elif isinstance(vals, pd.Series) and groupby is not None:
        groups = pd.groupby(vals, groupby).groups
        order = sorted(groups) if order is None else order
        if hasattr(groupby, "name"):
            if groupby.name is not None:
                xlabel = groupby.name
        if vals.name is not None:
            ylabel = vals.name
        vals = [vals.reindex(groups[name]) for name in order]
        if names is None:
            names = order

    else:

        # Handle case where the input data is an array or there was no groupby
        if hasattr(vals, 'shape'):
            if len(vals.shape) == 1:
                if np.isscalar(vals[0]):
                    vals = [vals]
                else:
                    vals = list(vals)
            elif len(vals.shape) == 2:
                nr, nc = vals.shape
                if nr == 1:
                    vals = [vals]
                elif nc == 1:
                    vals = [vals.ravel()]
                else:
                    vals = [vals[:, i] for i in range(nc)]
            else:
                error = "Input `vals` can have no more than 2 dimensions"
                raise ValueError(error)

        # This should catch things like flat lists
        elif np.isscalar(vals[0]):
            vals = [vals]

        # By default, just use the plot positions as names
        if names is None:
            names = list(range(1, len(vals) + 1))
        elif hasattr(names, "name"):
            if names.name is not None:
                xlabel = names.name

    # Now convert vals to a common representation
    # The plotting functions will work with a list of arrays
    # The list allows each array to possibly be of a different length
    vals = [np.asarray(a, np.float) for a in vals]

    return vals, xlabel, ylabel, names
Example #51
beginning = pd.to_datetime("2014-07-01")
comments = comments[comments['date'].notnull()]
comments = comments[comments['date'] >= beginning]

comments.index=comments['date']
comments.index = comments.index.tz_localize('UTC').tz_convert('US/Central')

# determining gender membership
comments['male'] = [1 if mset.intersection(row.split()) else 0 for row in comments['norm_message']]
comments['female'] = [1 if fset.intersection(row.split()) else 0 for row in comments['norm_message']]
comments['both'] = [1 if row['male'] == 1 and row['female'] == 1 else 0 for (i, row) in comments.iterrows()]
comments['none'] = [0 if row['male'] == 1 or row['female'] == 1 else 1 for (i, row) in comments.iterrows()]

# start of analysis on different categories in the data

grouped = pd.groupby(comments,by=[comments.index.year,comments.index.month])
res = pd.DataFrame(columns=colNames)

for name, group in grouped:
    if name[0] >= 2015 or (name[0] == 2014 and name[1] >=7) :
        print name
        t = agg_groups(group, name)
        res = res.append(t, ignore_index = True)
res.to_csv(runName + 'month_year.csv', sep=',');

grouped = pd.groupby(comments,by=[comments.index.dayofweek,comments.index.hour])
res = pd.DataFrame(columns=colNames)

for name, group in grouped:
    print name
    t = agg_groups(group, name)
                  77177]
rng_seed_list2 = range(9725, 9727+50*7, 7)
rng_seed_list3 = range(9726, 9728+50*7, 7)
rng_seed_list = rng_seed_list1 + rng_seed_list2 + rng_seed_list3
assert len(rng_seed_list) >= NUM_RAND

####### 3. Augment training data #######
data = np.load("./data/processed_train.npy")
obs_ids_all = np.load("./valid/obs_ids_valid_cv%s.npy" % (CV))

data_pd = pd.DataFrame(data=data[:,0:], columns=COLUMNS)
data_pd_ids_all = np.array(data_pd['Id'])
data_pd_ids_selected = np.in1d(data_pd_ids_all, obs_ids_all)
data_pd_filtered = data_pd[data_pd_ids_selected]

data_pd_gp = pd.groupby(data_pd_filtered, "Id")
data_size = len(data_pd_gp)

for jj, rng_seed in enumerate(rng_seed_list[0:NUM_RAND]):
    rng = np.random.RandomState(rng_seed) 
    output = np.empty((data_size, INPUT_WIDTH, 22))
    y_output = np.zeros(data_size)
    
    i = 0
    for _, group in data_pd_gp:
        group_array = np.array(group)
        X = extend_series(group_array[:,1:23], rng, target_len=INPUT_WIDTH) 
        y = group_array[0,23]
        output[i,:,:] = X[:,:]
        y_output[i]= y
        i += 1
Example #53
File: test_api.py Project: glyg/pandas
 def test_groupby(self):
     with tm.assert_produces_warning(FutureWarning,
                                     check_stacklevel=False):
         pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1])
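This test documents the FutureWarning behind the examples on this page: the top-level pandas.groupby only ever delegated to the DataFrame/Series .groupby method and has since been removed, so on current pandas the call above has to be written in the method form:

# method form; equivalent to the deprecated pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1]).sum()
pd.Series([1, 2, 3]).groupby([1, 1, 1]).sum()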
bench_sharpe_ratio = get_benchmark_from_db(group)[1]

# Get the cumulative return/risk (sharpe ratio) of all strategies for each month
for freq in freqs:
    resultsdir = os.getcwd() + '/synology/%s/RESULTS_%s/' % (group, freq)
    if not os.path.exists(resultsdir + "Ranking/"):
        os.makedirs(resultsdir + "Ranking/")

    strategy_ids = [results.split('/')[-1].split('.')[0] for results in glob.glob(resultsdir + '*.csv')]
    i = 0
    ident = 0
    for strategy_id in strategy_ids:
       	ident += 1
       	#leg = leg.append[ident,strategy_id]
       	returns = pd.read_csv(resultsdir + strategy_id + ".csv", index_col = 'Unnamed: 0', parse_dates = True)['returns']
      	returns_copy1 = pd.groupby(returns,by=[returns.index.year,returns.index.month]).apply(lambda rets: pa.ts_metrics.cum_returns(rets).iloc[-1])
      	st_deviation = pd.groupby(returns, by=[returns.index.year,returns.index.month]).apply(lambda rets: np.std(rets))
      	sharpe_ratio = returns_copy1 / st_deviation
      	strategy_result = pd.concat([returns_copy1.to_frame(), st_deviation.to_frame(), sharpe_ratio.to_frame()], axis = 1) 
      	if (i == 0):
      		sratio = pd.DataFrame(index = strategy_result.index)
      		sratio = pd.concat([sratio, sharpe_ratio.to_frame(name = ident)], axis = 1)
      		cum_returns = pd.DataFrame(index = strategy_result.index)
      		cum_returns = pd.concat([cum_returns, returns_copy1.to_frame(name = ident)], axis = 1)
      		i = 1
      	else:
      		sratio = pd.concat([sratio, sharpe_ratio.to_frame(name = ident)], axis = 1)
      		cum_returns = pd.concat([cum_returns, returns_copy1.to_frame(name = ident)], axis = 1)

bench_returns.name = 14
bench_sharpe_ratio.columns = [14]
# TMAX - maximum daily temperature, Degrees Celsius  
# TMIN - minimum daily temperature, Degrees Celsius  
# AWND - average daily wind speed, meters per seconds
# 
# An example row from the data:

# In[10]:

weather_chicago.head(1)


# You can investigate the weather conditions in Chicago using the plot below. If you click on the legend you turn the data series visibility on/off.

# In[11]:

weather_temp = pd.groupby(weather_chicago, by=[weather_chicago.index.week]).mean()

layout = go.Layout(
    title='Mean values of weather conditions in Chicago per week of the year',
    yaxis=dict(
        rangemode='tozero',
        autorange=True,
        hoverformat='.1f',
        title='[respective units]'
    ),
    xaxis=dict(
        title='Week of the year'
    )
)

data_list = []
def process_weather_stations(weather_stations, path='', frequency='A', \
                           plot_monthly_pattern=False, \
                           plot_yearly_rainfall=False):
    
    assert frequency in ['A', 'M'], "Frequency must be either: 'A' or 'M'"
    
    weather_station_details = {}
    weather_dfs = {}
    
    for station in weather_stations:
        with open(os.path.join(path, station + '.txt'), 'r') as f:
            text = f.read()         
    
            # Get station number:
            station_number = re.search('Patched Point data for station: (\S+)', text).group(1)
    
            # Get station Lat Long which corresponds to GDA94:
            station_latlong = re.search('Lat: (\S+) Long: (\S+)', text).group().strip('"')
    
            # Get elevation of station:
            station_elev = re.search('Elevation:\s+(\w+)', text).group()
            
        weather_station_details[station] = [station_number, station_latlong , station_elev]
    
        #Read in time series data:
        weather_dfs[station] = pd.read_csv(os.path.join(path, station + '.txt'), 
                                           index_col=0, 
                                           skiprows=[41], 
                                           parse_dates=True, 
                                           infer_datetime_format=True, 
                                           delim_whitespace=True, 
                                           comment='"', 
                                           skipinitialspace=True, 
                                           usecols=[0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])

    def cm2inch(*tupl):
        inch = 2.54
        if isinstance(tupl[0], tuple):
            return tuple(i/inch for i in tupl[0])
        else:
            return tuple(i/inch for i in tupl)
    

    def get_rain_and_ET_from_df(df, stations, freq, how='sum'):
        new_df = pd.DataFrame()
        for station in stations:
            if how == 'mean':
                new_df.loc[:, station] = df[station]['Rain'].resample(freq).mean()
                new_df.loc[:, station + '_ET'] = df[station]['Evap'].resample(freq).mean()
            elif how == 'sum':
                new_df.loc[:, station] = df[station]['Rain'].resample(freq).sum()
                new_df.loc[:, station + '_ET'] = df[station]['Evap'].resample(freq).sum()
            # end if
        #end for
        return new_df
        
    annual_weather_df = get_rain_and_ET_from_df(weather_dfs, weather_stations,
                                                'A', how='sum')
    monthly_weather_df = get_rain_and_ET_from_df(weather_dfs, weather_stations,
                                                'M', how='mean') 

    if plot_yearly_rainfall:
        plt.figure(figsize=cm2inch(18,8))
        plt.ylabel("Annual Rainfall [mm]")
        
        for station in weather_stations:
            weather_dfs[station]['Rain'].plot()        
            weather_dfs[station]['Rain'].resample("M", how='sum').plot()    
            weather_dfs[station]['Rain'].resample("A", how='sum'). \
            plot(legend=True, 
                 label=station + ', ' 
                 + weather_station_details[station][0] + ', '  
                 + weather_station_details[station][2] + ', Average: '  
                 + str(weather_dfs[station]['Rain'].resample("A", how='sum').mean())[:5] + 'mm')
            
        plt.xlabel("Year")
        plt.legend(bbox_to_anchor=(0, 1), loc='upper left', ncol=1)

        annual_weather_df.plot(kind='box')
        plt.ylabel("Annual Rainfall [mm]")
        
    if plot_monthly_pattern:
        Months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        month_avg = pd.groupby(monthly_weather_df,by=[monthly_weather_df.index.month]).mean()
        month_avg['Months'] = Months
        
        month_avg.plot(kind='bar',x='Months',y=weather_stations)    
        
        plt.ylabel('Average Monthly Rainfall [mm]')
        plt.xlabel("")
        plt.tight_layout()
        plt.legend(bbox_to_anchor=(0, 1), loc='upper left', ncol=1)

    if frequency == 'A':
        # Keeping this as is for now but should not calculate mean here 
        return annual_weather_df.mean()
    if frequency == 'M':
        return monthly_weather_df
Example #57
def genplot(df2,day,pdf=None):
    # Figure out how many dots may appear in one x bin
    max_people_per_bin = pd.groupby(df2,['Team','Location']).count()['Site'].max()

    # Figure out how many teams
    num_teams = len(df2['Team'].unique())
    num_locations = len(df2['Location'].unique())

    # Complete for all days. Consider loop based on 'Day'.unique()
    daydf = df2[df2['Day']==day]

    colors = getColors()

    X = daydf['Site'].values + 0.5
    Y = daydf['TeamMap'].values + 0.5
    day_team_values = daydf['Team'].values
    day_site_values = daydf['Site'].values
    s = []
    for i in range(0,len(X)):
        size = len(daydf[(daydf['Team']==day_team_values[i]) & (daydf['Site']==day_site_values[i])])
        s.append(size)
    #s = [20*4**n for n in range(len(x))]
    #t = [20*4**x for x in s]
    t = [100*x for x in s]
    plt.scatter(X,Y,s=t,alpha=0.5)

    # Fill in the background
    # TODO: colors is conveniently sized to match data, should expand
    for i in range(0,num_locations):
        xgap = [i,i+1]
        plt.fill_between(x=xgap, y1=num_teams, y2=0, color=colors[i], alpha=0.2) 

    # Consider setting the color or alpha here
    for i in range(0,num_teams-1): # don't bother with top line
        plt.axhline(y=i+1,alpha=0.2)

    # Make sure to use the full df not day-specific
    #loc_labels = df2['Location'].unique()
    loc_labels=['Off',
                'Tel',
                'Loc A',
                'Loc B',
                'Loc C',
                'Vacation',
                'Other',
               ]
    plt.xticks(np.arange(len(loc_labels))+0.5,loc_labels)#,rotation=45)

    team_labels=df2['Team'].unique()
    plt.yticks(np.arange(len(team_labels))+0.5,team_labels)

    ax = plt.gca()
    ax.set_autoscale_on(False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    plt.tick_params(labelsize=10)
    plt.tick_params(axis='x', top=False)
    plt.tick_params(axis='y', left=False, right=False)

    #plt.title('Locations for %s'%(day))
    #plt.xlabel('Location')
    plt.xlabel('Locations for %s'%(day))
    plt.ylabel('Team')

    plt.xlim(0,num_locations)
    plt.ylim(0,num_teams)

    if pdf:
        pdf.savefig()
        plt.close()
    else:
        plt.show()
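
# A minimal driving loop for genplot (sketch only -- assumes df2 was prepared elsewhere
# with the 'Day', 'Team', 'TeamMap', 'Location' and 'Site' columns used above, that
# getColors() is defined elsewhere, and that the output filename is arbitrary):
#
#     from matplotlib.backends.backend_pdf import PdfPages
#     with PdfPages('locations.pdf') as pdf:
#         for day in df2['Day'].unique():
#             plt.figure()
#             genplot(df2, day, pdf=pdf)
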
print_image_directive(filename, figure)

best_index = numpy.where(scores==numpy.max(scores))
print("   Best Score, {0:.2f}".format(scores[best_index][0]))
print("   max-depth parameter with best score,{0}".format(parameters[best_index][0]))

bin_range = best_models.parameter.max() - best_models.parameter.min()
bins = pandas.cut(best_models.parameter,
                  bin_range)
counts = bins.value_counts()
for bounds in counts.index:
    # value_counts over a cut column is indexed by Interval objects;
    # use the interval's left edge instead of parsing its string form
    print('   {0},{1}'.format(int(round(bounds.left)),
                              counts.loc[bounds]))

parameter_group = best_models.groupby('parameter')
medians = parameter_group.score.median()
for max_depth in medians.index:
    print('   {0},{1:.2f}'.format(max_depth, medians.loc[max_depth]))

maxes = parameter_group.score.max()
for max_depth in maxes.index:
    print('   {0},{1:.2f}'.format(max_depth, maxes.loc[max_depth]))

best_model = models[best_index[0][0]]
sale_price = best_model.predict(CLIENT_FEATURES)
predicted = sale_price[0] * 1000
actual_median = housing_frame.median_value.median() * 1000
print("   Predicted value of client's home; ${0:,.2f}".format(predicted))
print("   Difference between median and predicted; ${0:,.2f}".format(actual_median - predicted))
Example #59
0
def munge_data(df_orig):
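    '''Aggregate the raw grants frame to one row per Grant.Application.ID (investigator,
    role, country, publication and category features), split it into train/test sets by
    date using time_mask, and return X/y arrays plus the corresponding DataFrames.'''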
    df = df_orig.copy()
    del df['Person.ID.1']
    # Find the oldest investigator's birth date
    oldest = pd.DataFrame(df.groupby('Grant.Application.ID')['Year.of.Birth.1'].min())


    # Get the number of investigators for each role
    numRole = pd.get_dummies(df['Role.1'])
    numRole['Grant.Application.ID'] = df['Grant.Application.ID']
    numRole = numRole.groupby('Grant.Application.ID')[numRole.columns].sum()

    # Get the % of aussies
    numAussies = pd.get_dummies(df['Country.of.Birth.1'])
    numAussies['Grant.Application.ID'] = df['Grant.Application.ID']
    numAussies = numAussies.groupby('Grant.Application.ID')[numAussies.columns].sum()

    # NaN ratios (no country info at all) are imputed to zero; the ID column is excluded from the denominator
    prcAussies = pd.DataFrame((numAussies['Australia'] /
                               numAussies.drop('Grant.Application.ID', axis=1).sum(axis=1)).fillna(0),
                              columns=['% Australians'])

    # Sum the # of published papers

    numPapers = df.groupby('Grant.Application.ID')[['A..1', 'A.1', 'B.1', 'C.1',
                                                    'Number.of.Successful.Grant.1',
                                                    'Number.of.Unsuccessful.Grant.1']].sum()


    df['Contract.Value.Band...see.note.A'].fillna('A', inplace=True)
    df['Contract.Value.Band...see.note.A']=df['Contract.Value.Band...see.note.A'].apply(lambda x: ord(x.rstrip(' ')))

    # converting categories to dummy variables

    grant_cats = pd.get_dummies(df['Grant.Category.Code'], dummy_na=True)    
    grant_cats['Grant.Application.ID']=df['Grant.Application.ID']
    grant_cats = grant_cats.groupby('Grant.Application.ID')[grant_cats.columns].min()
    grant_cats = pd.DataFrame(grant_cats)

    # imputing missing percentages for RFCD.Percentage columns with the mean
    df['RFCD.Percentage.1'].fillna(df['RFCD.Percentage.1'].mean(), inplace=True)
    df['RFCD.Percentage.2'].fillna(df['RFCD.Percentage.2'].mean(), inplace=True)
    df['RFCD.Percentage.3'].fillna(df['RFCD.Percentage.3'].mean(), inplace=True)
    df['RFCD.Percentage.4'].fillna(df['RFCD.Percentage.4'].mean(), inplace=True)
    df['RFCD.Percentage.5'].fillna(df['RFCD.Percentage.5'].mean(), inplace=True)

    # doing the same as above with SEO.Percentage columns
    df['SEO.Percentage.1'].fillna(df['SEO.Percentage.1'].mean(), inplace=True)
    df['SEO.Percentage.2'].fillna(df['SEO.Percentage.2'].mean(), inplace=True)
    df['SEO.Percentage.3'].fillna(df['SEO.Percentage.3'].mean(), inplace=True)
    df['SEO.Percentage.4'].fillna(df['SEO.Percentage.4'].mean(), inplace=True)
    df['SEO.Percentage.5'].fillna(df['SEO.Percentage.5'].mean(), inplace=True)
    rfcds = combine_columns(df, 'RFCD.Code.', 'RFCD.Percentage.')
    seos = combine_columns(df, 'SEO.Code.', 'SEO.Percentage.')

    # Get rid of everything we don't need
    # REMINDER - LATER COME BACK AND DEAL WITH DEPARTMENT, FACULTY, NO YEARS AT FACULTY, PHD, ETC
    df.drop(['A..1', u'A.1', u'B.1', u'C.1', u'Country.of.Birth.1', u'Dept.No..1', u'Faculty.No..1',
             u'Home.Language.1', u'No..of.Years.in.Uni.at.Time.of.Grant.1', u'Number.of.Successful.Grant.1',
             u'Number.of.Unsuccessful.Grant.1', u'Role.1', u'Sponsor.Code', u'With.PHD.1', u'Year.of.Birth.1',
             u'SEO.Code.1', u'SEO.Code.2', u'SEO.Code.3', u'SEO.Code.4', u'SEO.Code.5',
             u'RFCD.Code.1', u'RFCD.Code.2', u'RFCD.Code.3', u'RFCD.Code.4', u'RFCD.Code.5',
             'Grant.Category.Code',
             u'RFCD.Percentage.1', u'RFCD.Percentage.2', u'RFCD.Percentage.3', u'RFCD.Percentage.4', u'RFCD.Percentage.5',
             u'SEO.Percentage.1', u'SEO.Percentage.2', u'SEO.Percentage.3', u'SEO.Percentage.4', u'SEO.Percentage.5'],
            inplace=True, axis=1)
    df.drop_duplicates(inplace = True)
    df.set_index('Grant.Application.ID', inplace=True)
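
    # Join all per-grant aggregates back onto the main frame on the Grant.Application.ID index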

    finalDf = pd.merge(df, oldest, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, numRole, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, prcAussies, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, numPapers, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, grant_cats, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, rfcds, left_index = True, right_index = True)
    finalDf = pd.merge(finalDf, seos, left_index = True, right_index = True)

    #imputing ages with median
    finalDf['Year.of.Birth.1'] = finalDf['Year.of.Birth.1'].fillna(finalDf['Year.of.Birth.1'].median())

    #imputing missing papers with 0
    finalDf['A..1']=finalDf['A..1'].fillna(0)
    finalDf['A.1']=finalDf['A.1'].fillna(0)
    finalDf['B.1']=finalDf['B.1'].fillna(0)
    finalDf['C.1']=finalDf['C.1'].fillna(0)

    #imputing missing successful and unsuccessful grants with 0
    finalDf['Number.of.Successful.Grant.1']=finalDf['Number.of.Successful.Grant.1'].fillna(0)
    finalDf['Number.of.Unsuccessful.Grant.1']=finalDf['Number.of.Unsuccessful.Grant.1'].fillna(0)

    del finalDf['Grant.Application.ID_y']
    del finalDf['Grant.Application.ID_x']
    finalDf['Proc.Start.Date'] = finalDf['Start.date'].apply(lambda x:
                          time.mktime(datetime.datetime.strptime(x,'%d/%m/%y').timetuple()))
    #splitting dataframe
    #m, b = 4.8261954316943646e-08, -53.597570965226083
    #finalDf['Number.of.Unsuccessful.Grant.1']= finalDf['Number.of.Unsuccessful.Grant.1'] - (m  * finalDf['Proc.Start.Date'] +b)
    
    mask = time_mask(finalDf)
    finalDf_test = finalDf[mask].copy()
    finalDf_train = finalDf[~mask].copy()

    #creating X, y splits for test and train dataframes
    y_train = finalDf_train['Grant.Status'].values
    del finalDf_train['Grant.Status']
    del finalDf_train['Start.date']
    X_train = finalDf_train.values

    y_test = finalDf_test['Grant.Status'].values
    del finalDf_test['Grant.Status']
    del finalDf_test['Start.date']
    X_test = finalDf_test.values

    return X_train, y_train, X_test, y_test, finalDf_test, finalDf_train
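
# Typical call (sketch only -- the raw grants DataFrame name is illustrative, and
# combine_columns and time_mask are helpers defined elsewhere in the original module):
#
#     X_train, y_train, X_test, y_test, test_df, train_df = munge_data(raw_grants_df)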