Example #1
 def create_duos(self) -> None:
     """ Método usado para alocar cada dupla em um dia útil do mês """
     if self.request.GET.get('novas_duplas'):
         today = date.today()
         duplas = generate_duos()
         dias_mes = get_days(today.month)
         br = Brazil()
         index = 0
         for dia in range(today.day, dias_mes + 1):
             if br.is_working_day(date(today.year, today.month, dia)):
                 # Save the pairs with their assigned cleaning day in the database
                 try:
                     Duplas.objects.create(
                         integrante_1=Integrante.objects.get(
                             id=duplas[index][0]),
                         integrante_2=Integrante.objects.get(
                             id=duplas[index][1]),
                         date=date(today.year, today.month, dia))
                     index += 1
                 except IndexError:
                     # On an IndexError, wrap around to the first pair and reset the index
                     Duplas.objects.create(
                         integrante_1=Integrante.objects.get(
                             id=duplas[0][0]),
                         integrante_2=Integrante.objects.get(
                             id=duplas[0][1]),
                         date=date(today.year, today.month, dia))
                     index = 1
                 except IntegrityError:
                     messages.add_message(
                         self.request, messages.INFO,
                         "As duplas deste mês já foram criadas")
                     break
Example #2
 def getdate2Holiday(d):
     """Return the number of days from date ``d`` to the next Brazilian holiday.

     If no holiday of ``d``'s year remains, the (non-positive) distance to the
     last holiday of that year is returned instead.
     """
     cal = Brazil()
     holiday = cal.holidays(d.year)  # list of (date, label) tuples, sorted by date
     dis, flag = 0, 0
     # normalize the input to a plain datetime.date
     year, month, day = d.year, d.month, d.day
     d = datetime.date(year, month, day)
     for i, h in enumerate(holiday):
         if holiday[i][0] > d:
             flag = 1
             dis = (holiday[i][0] - d).days
             return dis
     if flag == 0:
         dis = (holiday[-1][0] - d).days
     return dis
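A minimal usage sketch (assuming the function above and its datetime import are in scope, and workalendar is installed); cal.holidays(year) returns (date, label) tuples sorted by date, which is what the loop above walks through:

import datetime

# days from 10 April 2021 until the next Brazilian holiday of that year
print(getdate2Holiday(datetime.date(2021, 4, 10)))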
Example #3
def get_cal_from_country(bmk_country):
    """
    Function returning a calendar based on the 'benchmark_country' of the csv file
    # Python package to manage holidays per country
    # >> See : https://github.com/peopledoc/workalendar
    from 'benchmark_country' column (to be parsed) in the Derivation Script
    Warning : Tuples may appear like [USA, Japon] or [USA, China] instead of China
    [Germany, China] instead of Russia
    [USA, China] instead of Moyen-Orient
    [USA, China] instead of Brasil
    Currently missing : China, Russia
    @TODO : ADD HONG-KONG !!! (for 'HSI_Index')
    NOTE :  5 avril 2018 : Ching Ming Festival (jour férié Hong-Kong !)
    :param bmk_country: benchmark country (type: string)
    :return:
        - cal: calendar related to the country (type: workalendar type ?)
    """
    cal = []
    if ',' in bmk_country:  # '[A, B]' => ['A', 'B']
        print "[WARNING] Tuple for the 'benchmark_country : {}, returning the first one..".format(
            bmk_country)
        bmk_country = bmk_country.replace('[', '').replace(']', '').split(',')
        bmk_country = bmk_country[0]  # TO BE DEFINED !

    if bmk_country == 'USA':
        cal = UnitedStates()
    elif bmk_country == 'Germany':
        cal = Germany()
    elif bmk_country == 'Japan':
        cal = Japan()
    elif bmk_country == 'France':
        cal = France()
    elif bmk_country == 'UK':
        cal = UnitedKingdom()
    elif bmk_country == 'Grèce':
        cal = Greece()
    elif bmk_country == 'Italie':
        cal = Italy()
    elif bmk_country == 'Espagne':
        cal = Spain()
    elif bmk_country == 'Brasil':
        cal = Brazil()
    return cal
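A quick usage sketch, assuming the calendar classes referenced above are imported from workalendar (e.g. Brazil from workalendar.america, UnitedStates from workalendar.usa):

from datetime import date

cal = get_cal_from_country('Brasil')            # workalendar Brazil() instance
print(cal.is_working_day(date(2020, 12, 25)))   # False: Christmas is a national holiday
print(cal.holidays(2020)[:3])                   # first few (date, label) tuples for 2020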
Example #4
def dia_util():
    cal = Brazil()
    cal.holidays(2019)  # note: this return value is not used

    # today's date
    hoje = datetime.now()

    ano = hoje.year
    mes = hoje.month
    dia = hoje.day

    # rebuild the date as a datetime.date
    data = ('{0}/{1}/{2}').format(ano, mes, dia)
    data = datetime.strptime(data, '%Y/%m/%d').date()

    varind = cal.is_working_day(data)  # True if today is a working day

    # Remembers whether the current day is already a working day
    verifica_dia = varind

    # if it is not a working day, advance day by day until a working day is found

    while not varind:
        data = datetime.fromordinal(data.toordinal() + 1)
        varind = cal.is_working_day(data)

    # extract the YEAR, MONTH and DAY of the final date
    ano = data.year
    mes = int(data.month)
    dia = int(data.day)

    if mes < 10:
        mes = '0{0}'.format(mes)

    if dia < 10:
        dia = '0{0}'.format(dia)

    datafinal = '{0}/{1}/{2}'.format(dia, mes, ano)

    return datafinal, verifica_dia
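For comparison, a shorter sketch of the same idea (shift today's date to the next working day); the function name proximo_dia_util is illustrative, not part of the original code:

from datetime import date, timedelta
from workalendar.america import Brazil

def proximo_dia_util(start=None):
    # returns (next working day as 'dd/mm/yyyy', whether the start day itself is a working day)
    cal = Brazil()
    d = start or date.today()
    ja_util = cal.is_working_day(d)
    while not cal.is_working_day(d):
        d += timedelta(days=1)
    return d.strftime('%d/%m/%Y'), ja_util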
Example #5
from workalendar.asia import SouthKorea
country_hols['Korea'] = SouthKorea()
# from workalendar.asia import India
# country_hols['India'] = India()
# from workalendar.asia import Thailand
# country_hols['Thailand'] = Thailand()
# from workalendar.asia import Vietnam
# country_hols['Vietnam'] = Vietnam()
# from workalendar.asia import Indonesia
# country_hols['Indonesia'] = Indonesia()

from workalendar.oceania import Australia
country_hols['Australia'] = Australia()

from workalendar.america import Brazil
country_hols['Brazil'] = Brazil()
from workalendar.america import Canada
country_hols['Canada'] = Canada()
from workalendar.america import Mexico
country_hols['Mexico'] = Mexico()
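country_hols is assumed to be a plain dict initialized earlier in the original file; a sketch of how the calendars collected above can be queried per country:

# one set of holiday dates per country, for a couple of years
holiday_dates = {
    name: {day for year in (2019, 2020) for day, _ in cal.holidays(year)}
    for name, cal in country_hols.items()
}
print(sorted(holiday_dates['Brazil'])[:5])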


def add_events(countries, years, my_hols):
    # The effect of events that move between countries (Olympics, ...) is very hard to model,
    # as the interaction impact shows up in different languages (i.e. different time series)
    # each time the host country changes, and it is very hard to move this impact across languages.
    for ev_name, ev_list in evh.events.items():
        for c, drg in ev_list:  # country, dates
            if c in countries:
                for d in drg:
                    if d.year in years:
Example #6
def brazil_all_holidays_set(year):
    """Returns all holidays in brazil
     with their respective type and coverage"""

    holidays_set = []

    # Get Brazilian national holidays
    cal = Brazil()
    for national_holidays in cal.holidays(year):
        holiday_name = national_holidays[1]
        holiday_date = national_holidays[0]

        if national_holidays[1] in COMMEMORATIVE_HOLIDAYS:
            tipo_feriado = 'C'
        else:
            tipo_feriado = 'F'
        holiday_obj = BrazilianHoliday(holiday_name, holiday_date, None, None,
                                       'N', tipo_feriado)
        if not any(x.nome == holiday_obj.nome for x in holidays_set):
            holidays_set.append(holiday_obj)

    # Get Brazilian bank holidays
    cal = BrazilBankCalendar()
    for bank_holidays in cal.holidays(year):
        holiday_name = bank_holidays[1]
        holiday_date = bank_holidays[0]

        holiday_obj = BrazilianHoliday(holiday_name, holiday_date, None, None,
                                       'N', 'B')
        if not any(x.nome == holiday_obj.nome for x in holidays_set):
            holidays_set.append(holiday_obj)

    # Get holidays from each Brazilian state
    for register in IBGE_REGISTER.items():
        estado_ibge = re.sub("BR-IBGE-", "", register[0])
        if len(estado_ibge) == 2:

            cal_state = IBGE_REGISTER[register[0]]()
            for state_holidays in cal_state.holidays(year):
                holiday_name = state_holidays[1]
                holiday_date = state_holidays[0]

                holiday_obj = BrazilianHoliday(holiday_name, holiday_date,
                                               estado_ibge, None, 'E', 'F')

                # Add it only if there is no national-level holiday with the same name
                if not any((x.nome == holiday_obj.nome and
                            not x.estado_ibge) for x in holidays_set):
                    holidays_set.append(holiday_obj)

    # Get Brazilian municipal holidays
    for register in IBGE_REGISTER.items():
        municipio_ibge = re.sub("BR-IBGE-", "", register[0])
        estado_ibge = municipio_ibge[0:2]
        if len(municipio_ibge) > 2:

            cal_city = IBGE_REGISTER[register[0]]()

            for city_holiday in cal_city.holidays(year):
                holiday_name = city_holiday[1]
                holiday_date = city_holiday[0]

                holiday_obj = BrazilianHoliday(holiday_name, holiday_date,
                                               estado_ibge,
                                               municipio_ibge,
                                               'M', 'F')

                # Add it only if there is no broader (non-municipal) holiday with the same name
                if not any((x.nome == holiday_obj.nome and
                            not x.municipio_ibge) for x in holidays_set):
                    holidays_set.append(holiday_obj)
    return holidays_set
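BrazilianHoliday and IBGE_REGISTER come from the host project; a self-contained sketch of just the national-level part, assuming both calendar classes are importable from workalendar.america as in the code above:

from workalendar.america import Brazil, BrazilBankCalendar

national = dict(Brazil().holidays(2020))  # {date: label}
bank_extra = {d: label for d, label in BrazilBankCalendar().holidays(2020)
              if d not in national}
print(sorted(bank_extra.items()))         # bank-only holidays of 2020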
Example #7
    def create_features(self):
        #hist_df = pd.read_csv('../input/historical_transactions.csv', nrows=num_rows)
        hist_df = feather.read_dataframe(
            '../data/input/historical_transactions.feather')

        # fillna
        hist_df['category_2'].fillna(6.0, inplace=True)
        hist_df['category_3'].fillna('D', inplace=True)
        #hist_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
        #hist_df['installments'].replace(-1, np.nan,inplace=True)
        #hist_df['installments'].replace(999, np.nan,inplace=True)

        # Convert the Y/N columns to 1/0
        hist_df['authorized_flag'] = hist_df['authorized_flag'].map({
            'Y': 1,
            'N': 0
        }).astype(int)
        hist_df['category_1'] = hist_df['category_1'].map({
            'Y': 1,
            'N': 0
        }).astype(int)
        #hist_df['category_3'] = hist_df['category_3'].map({'A':0, 'B':1, 'C':2, 'D':3})

        # NMF features per card_id over several categorical columns
        nmf_df_1 = calc_nmf_mat(hist_df, 'card_id', 'city_id')
        nmf_df_2 = calc_nmf_mat(hist_df, 'card_id', 'state_id')
        nmf_df_3 = calc_nmf_mat(hist_df, 'card_id', 'subsector_id')
        nmf_df_4 = calc_nmf_mat(hist_df, 'card_id', 'merchant_category_id')
        nmf_df_5 = calc_nmf_mat(hist_df, 'card_id', 'installments')
        nmf_df_6 = calc_nmf_mat(hist_df, 'card_id', 'category_3')
        nmf_df_7 = calc_nmf_mat(hist_df, 'card_id', 'category_2')

        # datetime features
        hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
        #    hist_df['year'] = hist_df['purchase_date'].dt.year
        hist_df['month'] = hist_df['purchase_date'].dt.month
        hist_df['day'] = hist_df['purchase_date'].dt.day
        hist_df['hour'] = hist_df['purchase_date'].dt.hour
        hist_df['weekofyear'] = hist_df['purchase_date'].dt.weekofyear
        hist_df['weekday'] = hist_df['purchase_date'].dt.weekday
        hist_df['weekend'] = (hist_df['purchase_date'].dt.weekday >=
                              5).astype(int)

        # additional features
        hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']

        # Brazilian holidays
        cal = Brazil()
        hist_df['is_holiday'] = hist_df['purchase_date'].dt.date.apply(
            cal.is_holiday).astype(int)

        # Days elapsed from the purchase date to each event date
        #Christmas : December 25 2017
        hist_df['Christmas_Day_2017'] = (
            pd.to_datetime('2017-12-25') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Mothers Day: May 14 2017
        hist_df['Mothers_Day_2017'] = (
            pd.to_datetime('2017-06-04') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #fathers day: August 13 2017
        hist_df['fathers_day_2017'] = (
            pd.to_datetime('2017-08-13') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Childrens day: October 12 2017
        hist_df['Children_day_2017'] = (
            pd.to_datetime('2017-10-12') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Valentine's Day : 12th June, 2017
        hist_df['Valentine_Day_2017'] = (
            pd.to_datetime('2017-06-12') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Black Friday : 24th November 2017
        hist_df['Black_Friday_2017'] = (
            pd.to_datetime('2017-11-24') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

        #2018
        #Mothers Day: May 13 2018
        hist_df['Mothers_Day_2018'] = (
            pd.to_datetime('2018-05-13') - hist_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

        hist_df['month_diff'] = ((datetime.datetime.today() -
                                  hist_df['purchase_date']).dt.days) // 30
        hist_df['month_diff'] += hist_df['month_lag']

        # additional features
        hist_df[
            'duration'] = hist_df['purchase_amount'] * hist_df['month_diff']
        hist_df['amount_month_ratio'] = hist_df['purchase_amount'] / hist_df[
            'month_diff']

        # Reduce memory usage (commented out below because it raises a puzzling error: pyarrow.lib.ArrowNotImplementedError: halffloat)
        #hist_df = reduce_mem_usage(hist_df)

        col_unique = ['subsector_id', 'merchant_id', 'merchant_category_id']
        col_seas = ['month', 'hour', 'weekofyear', 'day']

        aggs = {}
        for col in col_unique:
            aggs[col] = ['nunique']

        for col in col_seas:
            aggs[col] = ['nunique', 'mean', 'min', 'max']

        aggs['purchase_amount'] = ['sum', 'max', 'min', 'mean', 'var', 'skew']
        #aggs['installments'] = ['sum','max','mean','var','skew']
        aggs['purchase_date'] = ['max', 'min']
        aggs['month_lag'] = ['max', 'min', 'mean', 'var', 'skew']
        aggs['month_diff'] = ['max', 'min', 'mean', 'var', 'skew']
        aggs['authorized_flag'] = ['mean']
        aggs['weekend'] = ['mean', 'max']
        aggs['weekday'] = ['nunique', 'mean']  # overwrite
        aggs['category_1'] = ['mean']
        #aggs['category_2'] = ['mean']
        #aggs['category_3'] = ['mean']
        aggs['card_id'] = ['size', 'count']
        aggs['is_holiday'] = ['mean']
        aggs['price'] = ['sum', 'mean', 'max', 'min', 'var', 'skew']
        aggs['Christmas_Day_2017'] = ['mean']
        aggs['Mothers_Day_2017'] = ['mean']
        aggs['fathers_day_2017'] = ['mean']
        aggs['Children_day_2017'] = ['mean']
        aggs['Valentine_Day_2017'] = ['mean']
        aggs['Black_Friday_2017'] = ['mean']
        aggs['Mothers_Day_2018'] = ['mean']
        aggs['duration'] = ['mean', 'min', 'max', 'var', 'skew']
        aggs['amount_month_ratio'] = ['mean', 'min', 'max', 'var', 'skew']
        """
        for col in ['category_2','category_3']:
            hist_df[col+'_mean'] = hist_df.groupby([col])['purchase_amount'].transform('mean')
            hist_df[col+'_min'] = hist_df.groupby([col])['purchase_amount'].transform('min')
            hist_df[col+'_max'] = hist_df.groupby([col])['purchase_amount'].transform('max')
            hist_df[col+'_sum'] = hist_df.groupby([col])['purchase_amount'].transform('sum')
            aggs[col+'_mean'] = ['mean']
        """

        hist_df = hist_df.reset_index().groupby('card_id').agg(aggs)

        # Rename the columns
        hist_df.columns = pd.Index(
            [e[0] + "_" + e[1] for e in hist_df.columns.tolist()])
        hist_df.columns = ['hist_' + c for c in hist_df.columns]

        hist_df['hist_purchase_date_diff'] = (
            hist_df['hist_purchase_date_max'] -
            hist_df['hist_purchase_date_min']).dt.days
        hist_df['hist_purchase_date_average'] = hist_df[
            'hist_purchase_date_diff'] / hist_df['hist_card_id_size']
        hist_df['hist_purchase_date_uptonow'] = (
            datetime.datetime.today() -
            hist_df['hist_purchase_date_max']).dt.days
        hist_df['hist_purchase_date_uptomin'] = (
            datetime.datetime.today() -
            hist_df['hist_purchase_date_min']).dt.days

        hist_df = pd.merge(hist_df, nmf_df_1, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_2, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_3, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_4, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_5, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_6, on=['card_id'], how='left')
        hist_df = pd.merge(hist_df, nmf_df_7, on=['card_id'], how='left')

        #hist_df = hist_df.reset_index()

        train_df = feather.read_dataframe(
            '../features/traintest_train.feather')
        test_df = feather.read_dataframe('../features/traintest_test.feather')
        df = pd.concat([train_df, test_df], axis=0)
        init_cols = df.columns
        hist_df = pd.merge(df, hist_df, on='card_id', how='outer')

        hist_df_train = hist_df[hist_df['target'].notnull()]
        hist_df_test = hist_df[hist_df['target'].isnull()]

        hist_df_train = hist_df_train.drop(init_cols, axis=1)
        hist_df_test = hist_df_test.drop(init_cols, axis=1)

        #self.df = hist_df
        self.train = hist_df_train.reset_index(drop=True)
        self.test = hist_df_test.reset_index(drop=True)
Example #8
    def create_features(self):
        # load csv
        #new_merchant_df = pd.read_csv('../input/new_merchant_transactions.csv', nrows=num_rows)
        new_merchant_df = feather.read_dataframe(
            '../data/input/new_merchant_transactions.feather')

        # fillna
        new_merchant_df['category_2'].fillna(1.0, inplace=True)
        new_merchant_df['category_3'].fillna('A', inplace=True)
        new_merchant_df['merchant_id'].fillna('M_ID_00a6ca8a8a', inplace=True)
        new_merchant_df['installments'].replace(-1, np.nan, inplace=True)
        new_merchant_df['installments'].replace(999, np.nan, inplace=True)

        # Convert the Y/N columns to 1/0
        new_merchant_df['authorized_flag'] = new_merchant_df[
            'authorized_flag'].map({
                'Y': 1,
                'N': 0
            }).astype(int)
        new_merchant_df['category_1'] = new_merchant_df['category_1'].map(
            {'Y': 1, 'N': 0}).astype(int)
        new_merchant_df['category_3'] = new_merchant_df['category_3'].map(
            {'A': 0, 'B': 1, 'C': 2}).astype(int)

        # datetime features
        new_merchant_df['purchase_date'] = pd.to_datetime(
            new_merchant_df['purchase_date'])
        #    new_merchant_df['year'] = new_merchant_df['purchase_date'].dt.year
        new_merchant_df['month'] = new_merchant_df['purchase_date'].dt.month
        new_merchant_df['day'] = new_merchant_df['purchase_date'].dt.day
        new_merchant_df['hour'] = new_merchant_df['purchase_date'].dt.hour
        new_merchant_df['weekofyear'] = new_merchant_df[
            'purchase_date'].dt.weekofyear
        new_merchant_df['weekday'] = new_merchant_df[
            'purchase_date'].dt.weekday
        new_merchant_df['weekend'] = (
            new_merchant_df['purchase_date'].dt.weekday >= 5).astype(int)

        # additional features
        new_merchant_df['price'] = new_merchant_df[
            'purchase_amount'] / new_merchant_df['installments']

        # Brazilian holidays
        cal = Brazil()
        #    new_merchant_df['is_holiday'] = new_merchant_df['purchase_date'].dt.date.apply(cal.is_holiday).astype(int)

        # Days elapsed from the purchase date to each event date
        #Christmas : December 25 2017
        new_merchant_df['Christmas_Day_2017'] = (
            pd.to_datetime('2017-12-25') - new_merchant_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Mothers Day: May 14 2017
        #    new_merchant_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #fathers day: August 13 2017
        new_merchant_df['fathers_day_2017'] = (
            pd.to_datetime('2017-08-13') - new_merchant_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Childrens day: October 12 2017
        new_merchant_df['Children_day_2017'] = (
            pd.to_datetime('2017-10-12') - new_merchant_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Valentine's Day : 12th June, 2017
        #    new_merchant_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
        #Black Friday : 24th November 2017
        new_merchant_df['Black_Friday_2017'] = (
            pd.to_datetime('2017-11-24') - new_merchant_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

        #2018
        #Mothers Day: May 13 2018
        new_merchant_df['Mothers_Day_2018'] = (
            pd.to_datetime('2018-05-13') - new_merchant_df['purchase_date']
        ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

        new_merchant_df['month_diff'] = (
            (datetime.datetime.today() -
             new_merchant_df['purchase_date']).dt.days) // 30
        new_merchant_df['month_diff'] += new_merchant_df['month_lag']

        # additional features
        new_merchant_df['duration'] = new_merchant_df[
            'purchase_amount'] * new_merchant_df['month_diff']
        new_merchant_df['amount_month_ratio'] = new_merchant_df[
            'purchase_amount'] / new_merchant_df['month_diff']

        # Reduce memory usage
        #new_merchant_df = reduce_mem_usage(new_merchant_df)

        col_unique = ['subsector_id', 'merchant_id', 'merchant_category_id']
        col_seas = ['hour', 'weekofyear', 'weekday', 'day']

        aggs = {}
        for col in col_unique:
            aggs[col] = ['nunique']

        for col in col_seas:
            aggs[col] = ['nunique', 'mean', 'min', 'max']

        aggs['purchase_amount'] = ['sum', 'max', 'min', 'mean', 'var', 'skew']
        aggs['installments'] = ['sum', 'max', 'mean', 'var', 'skew']
        aggs['purchase_date'] = ['max', 'min']
        aggs['month_lag'] = ['max', 'min', 'mean', 'var', 'skew']
        aggs['month_diff'] = ['mean', 'var', 'skew']
        #    aggs['authorized_flag'] = ['mean']
        aggs['weekend'] = ['mean']
        aggs['month'] = ['mean', 'min', 'max']
        aggs['category_1'] = ['mean', 'min']
        aggs['category_2'] = ['mean', 'min']
        aggs['category_3'] = ['mean', 'min']
        aggs['card_id'] = ['size', 'count']
        #    aggs['is_holiday'] = ['mean']
        aggs['price'] = ['mean', 'max', 'min', 'var', 'skew']
        aggs['Christmas_Day_2017'] = ['mean']
        #    aggs['Mothers_Day_2017'] = ['mean']
        aggs['fathers_day_2017'] = ['mean']
        aggs['Children_day_2017'] = ['mean']
        #    aggs['Valentine_Day_2017'] = ['mean']
        aggs['Black_Friday_2017'] = ['mean']
        aggs['Mothers_Day_2018'] = ['mean']
        aggs['duration'] = ['mean', 'min', 'max', 'var', 'skew']
        aggs['amount_month_ratio'] = ['mean', 'min', 'max', 'var', 'skew']

        for col in ['category_2', 'category_3']:
            new_merchant_df[col + '_mean'] = new_merchant_df.groupby(
                [col])['purchase_amount'].transform('mean')
            new_merchant_df[col + '_min'] = new_merchant_df.groupby(
                [col])['purchase_amount'].transform('min')
            new_merchant_df[col + '_max'] = new_merchant_df.groupby(
                [col])['purchase_amount'].transform('max')
            new_merchant_df[col + '_sum'] = new_merchant_df.groupby(
                [col])['purchase_amount'].transform('sum')
            aggs[col + '_mean'] = ['mean']

        new_merchant_df = new_merchant_df.reset_index().groupby('card_id').agg(
            aggs)

        # Rename the columns
        new_merchant_df.columns = pd.Index(
            [e[0] + "_" + e[1] for e in new_merchant_df.columns.tolist()])
        new_merchant_df.columns = ['new_' + c for c in new_merchant_df.columns]

        new_merchant_df['new_purchase_date_diff'] = (
            new_merchant_df['new_purchase_date_max'] -
            new_merchant_df['new_purchase_date_min']).dt.days
        new_merchant_df['new_purchase_date_average'] = new_merchant_df[
            'new_purchase_date_diff'] / new_merchant_df['new_card_id_size']
        new_merchant_df['new_purchase_date_uptonow'] = (
            datetime.datetime.today() -
            new_merchant_df['new_purchase_date_max']).dt.days
        new_merchant_df['new_purchase_date_uptomin'] = (
            datetime.datetime.today() -
            new_merchant_df['new_purchase_date_min']).dt.days

        # Reduce memory usage
        #new_merchant_df = reduce_mem_usage(new_merchant_df)

        #new_merchant_df = new_merchant_df.reset_index()

        train_df = feather.read_dataframe(
            '../features/traintest_train.feather')
        test_df = feather.read_dataframe('../features/traintest_test.feather')
        df = pd.concat([train_df, test_df], axis=0)
        init_cols = df.columns
        new_merchant_df = pd.merge(df,
                                   new_merchant_df,
                                   on='card_id',
                                   how='outer')

        new_merchant_df_train = new_merchant_df[
            new_merchant_df['target'].notnull()]
        new_merchant_df_test = new_merchant_df[
            new_merchant_df['target'].isnull()]

        new_merchant_df_train = new_merchant_df_train.drop(init_cols, axis=1)
        new_merchant_df_test = new_merchant_df_test.drop(init_cols, axis=1)

        self.train = new_merchant_df_train.reset_index(drop=True)
        self.test = new_merchant_df_test.reset_index(drop=True)
Example #9
# -*- coding: utf-8 -*-
"""
Created on Wed Aug  7 16:37:25 2019

@author: Ramon.Nascimento
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, datetime, timezone
from workalendar.america import Brazil
cal = Brazil()

dados = pd.read_csv('./transactions.csv')

# convert the date column to datetime
dados.data = pd.to_datetime(dados.data, format='%d/%m/%Y')

# use bracket assignment so the derived columns are actually added to the DataFrame
dados['dia'] = dados['data'].dt.day
dados['mes'] = dados['data'].dt.month
dados['ano'] = dados['data'].dt.year
dados['diaAno'] = dados['data'].dt.dayofyear

dados.dtypes

dados.corr().round(4)
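The Brazil() calendar created above is not used in the visible snippet; a hedged sketch of how it could flag working days in the same DataFrame (the column name diaUtil is illustrative):

dados['diaUtil'] = dados['data'].dt.date.apply(cal.is_working_day)
print(dados[['data', 'diaUtil']].head())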
Example #10
def update_db(db_dir: str = r'investments_database.db'):
    """Updates the database.\n

    <b>Parameters:</b>\n
    db_dir (str): The path of the database file to be updated. Defaults to 'investments_database.db'.\n

    <b>Returns:</b>\n
    There is no return value.

    """
    ##STEP 1
    #connects to the database
    print(f'connected with the database {db_dir}\n')
    con = sqlite3.connect(db_dir)


    ##STEP 2
    #calculates the relevant date limits for the update process
    Cal = Brazil() #initializes the Brazilian calendar
    today = datetime.date.today()

    #queries the last update from the log table
    last_update = pd.to_datetime(pd.read_sql('select MAX(date) from update_log', con).iloc[0,0])

    last_quota = Cal.sub_working_days(last_update, 2) #date of the last published CVM report
    num_months = (today.year - last_quota.year) * 12 + (today.month - last_quota.month) + 1


    ##STEP 3
    #delete information that will be updated from the database tables
    print('deleting redundant data from the database... \n')
    tables = {'daily_quotas' : ['DT_COMPTC',last_quota.strftime("%Y-%m-01")],
              'ibov_returns' : ['Date',last_update.strftime("%Y-%m-%d")]}
    
    cursor = con.cursor()
    
    #sql delete statement to the database
    cursor.execute('delete from daily_quotas where DT_COMPTC >= :date', {'date': last_quota.strftime("%Y-%m-01")})
    cursor.execute('delete from ibov_returns where Date >= :date', {'date': last_update.strftime("%Y-%m-%d")})
        
    con.commit()  
    cursor.close()


    ##STEP 4
    #Pulls new data from CVM, investpy and the brazilian central bank
    #and pushes it to the database

    try:  # tries to read target funds if they were specified when starting the database
        target_funds = pd.read_sql('select targets from target_funds', con).targets.to_list()
    except DatabaseError:
        target_funds = []
    
    print('downloading new daily reports from the CVM website...\n')
    # downloads the daily CVM report for each month between the last update and today
    for m in range(num_months+1): 
        data_alvo = last_quota + relativedelta(months=+m) 
        informe = cvm_informes(data_alvo.year, data_alvo.month)
        if target_funds:
            informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
        try:
            informe.to_sql('daily_quotas', con , if_exists = 'append', index=False)
        except AttributeError:
            pass 

    #downloads the funds' registration (cadastral) information from CVM and pushes it to the database
    print('downloading updated cadastral information from cvm...\n')
    info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep = ';', encoding='latin1',
                           dtype = {'RENTAB_FUNDO': object,'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object, 'ENTID_INVEST': object,
                                    'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object, 'DIRETOR': object, 'CNPJ_CONTROLADOR': object,
                                    'CONTROLADOR': object}
                            )
    if target_funds: #filters target funds if they were specified when building the database.
        info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)]
    info_cad.to_sql('info_cadastral_funds', con, if_exists='replace', index=False)

    #updates daily interest returns (selic)
    print('updating selic rates...\n')
    selic = pd.read_json('http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(11))
    selic['data'] = pd.to_datetime(selic['data'], format = '%d/%m/%Y')
    selic['valor'] = selic['valor']/100 #converts the percentage value to a decimal rate

    #calculates asset "price" considering day 0 price as 1
    selic.loc[0,'price'] = 1 * (1 + selic.loc[0,'valor'])
    for i in range(1, len(selic)):
        selic.loc[i, 'price'] = selic.loc[i-1, 'price'] * (1 + selic.loc[i,'valor'])

    selic.rename(columns = {'data':'date', 'valor':'rate'}, inplace = True)

    #filters only new data
    selic = selic[selic.date>=(last_update + datetime.timedelta(-1))]
    selic.to_sql('selic_rates', con , if_exists = 'append', index=False) 

    #updates ibovespa data
    print('updating ibovespa returns...\n')
    today = (datetime.date.today() + datetime.timedelta(1)).strftime('%Y-%m-%d')
    ibov = pd.DataFrame(YahooFinancials('^BVSP').get_historical_price_data(last_update.strftime('%Y-%m-%d'), today, 'daily')['^BVSP']['prices'])
    ibov = ibov.drop(columns=['date', 'close']).rename(columns={'formatted_date':'date', 'adjclose':'close'}).iloc[:,[5,0,1,2,3,4]]
    ibov['date'] = pd.to_datetime(ibov['date'])
    ibov.columns = [i.capitalize() for i in ibov.columns] #capitalizes columns to keep consistency with previous format (investpy)
    ibov.to_sql('ibov_returns', con , if_exists = 'append', index=False)

    ##STEP 5
    #updates the log in the database
    print('updating the log...\n')
    update_log = pd.DataFrame({'date':[datetime.datetime.now()], 'log':[1]})
    update_log.to_sql('update_log', con, if_exists = 'append', index=False)


    ##STEP 6
    #closes the connection with the database
    con.close()
    print('connection with the database closed!\n')

    print(f'database {db_dir} updated!\n')
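A small sketch of the STEP 2 date arithmetic in isolation, assuming only that workalendar is installed (sub_working_days steps back over working days on the Brazilian calendar):

import datetime
from workalendar.america import Brazil

cal = Brazil()
today = datetime.date.today()
# the last published CVM report is assumed to lag two working days behind
last_quota = cal.sub_working_days(today, 2)
print(last_quota)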