def create_duos(self) -> None:
    """Assign each cleaning duo to a working day of the current month.

    Only runs when the request carries the 'novas_duplas' GET flag.
    Walks every remaining day of the month, skips non-working days
    (Brazilian calendar) and persists one Duplas row per working day,
    cycling back to the first duo when the list is exhausted. Stops with
    an info message if the month's duos already exist (IntegrityError).
    """
    if not self.request.GET.get('novas_duplas'):
        return
    hoje = date.today()
    pares = generate_duos()
    ultimo_dia = get_days(hoje.month)
    calendario = Brazil()
    proximo = 0
    for numero_dia in range(hoje.day, ultimo_dia + 1):
        data_limpeza = date(hoje.year, hoje.month, numero_dia)
        if not calendario.is_working_day(data_limpeza):
            continue
        # Persist the duo assigned to this working day.
        try:
            Duplas.objects.create(
                integrante_1=Integrante.objects.get(id=pares[proximo][0]),
                integrante_2=Integrante.objects.get(id=pares[proximo][1]),
                date=data_limpeza)
            proximo += 1
        except IndexError:
            # Exhausted the duo list: wrap around to the first pair and
            # continue the cycle from there.
            Duplas.objects.create(
                integrante_1=Integrante.objects.get(id=pares[0][0]),
                integrante_2=Integrante.objects.get(id=pares[0][1]),
                date=data_limpeza)
            proximo = 1
        except IntegrityError:
            # Duos for this month were already generated; tell the user and stop.
            messages.add_message(
                self.request, messages.INFO,
                "As duplas deste mês já foram criadas")
            break
def getdate2Holiday(d):
    """Return the number of days from *d* to the next Brazilian holiday.

    Parameters:
        d: a date-like object exposing .year/.month/.day (date or datetime).

    Returns:
        int: days until the first holiday of d's year strictly after *d*.
        When no holiday of that year falls after *d*, the (non-positive)
        distance to the year's last holiday is returned instead, matching
        the original fallback behavior.
    """
    cal = Brazil()
    # workalendar returns a list of (date, holiday_name) tuples, sorted.
    holidays = cal.holidays(d.year)
    # Normalize the input (date or datetime) to a plain date for comparison,
    # without shadowing the parameter as the original did.
    target = datetime.date(d.year, d.month, d.day)
    for holiday_date, _name in holidays:
        if holiday_date > target:
            return (holiday_date - target).days
    # No holiday left this year: fall back to the last one (<= 0 days away).
    return (holidays[-1][0] - target).days
def get_cal_from_country(bmk_country):
    """Return a workalendar calendar matching a 'benchmark_country' value.

    Python package to manage holidays per country:
    https://github.com/peopledoc/workalendar
    The value comes from the 'benchmark_country' column (to be parsed) in
    the Derivation Script CSV.

    Warning: tuples may appear, e.g. '[USA, Japon]' or '[USA, China]'
    instead of China, '[Germany, China]' instead of Russia, '[USA, China]'
    instead of Moyen-Orient or Brasil.
    Currently missing: China, Russia.
    @TODO: add Hong-Kong (for 'HSI_Index');
    NOTE: 5 April 2018: Ching Ming Festival is a Hong-Kong holiday.

    :param bmk_country: benchmark country (type: string)
    :return: cal: workalendar calendar for the country, or [] when unknown
    """
    cal = []
    if ',' in bmk_country:
        # '[A, B]' => keep only the first country of the tuple.
        # BUGFIX: the original used a Python 2 print statement, which is a
        # syntax error under Python 3.
        print("[WARNING] Tuple for the 'benchmark_country : {}, returning the first one..".format(
            bmk_country))
        bmk_country = bmk_country.replace('[', '').replace(']', '').split(',')
        bmk_country = bmk_country[0]  # TO BE DEFINED !
    # Lazy elif chain (not a dict) so unknown countries never touch the
    # calendar classes and simply fall through to the [] default.
    if bmk_country == 'USA':
        cal = UnitedStates()
    elif bmk_country == 'Germany':
        cal = Germany()
    elif bmk_country == 'Japan':
        cal = Japan()
    elif bmk_country == 'France':
        cal = France()
    elif bmk_country == 'UK':
        cal = UnitedKingdom()
    elif bmk_country == 'Grèce':
        cal = Greece()
    elif bmk_country == 'Italie':
        cal = Italy()
    elif bmk_country == 'Espagne':
        cal = Spain()
    elif bmk_country == 'Brasil':
        cal = Brazil()
    return cal
def dia_util():
    """Return the next Brazilian working day and whether today is one.

    Returns:
        tuple: (datafinal, verifica_dia) where datafinal is the first
        working day from today onward formatted as 'dd/mm/yyyy' (zero
        padded) and verifica_dia is True when today itself is a working
        day (i.e. not a weekend or Brazilian holiday).
    """
    cal = Brazil()
    # Today's date. The original built a 'Y/m/d' string from now() and
    # re-parsed it with strptime; .date() is the direct equivalent. A dead
    # cal.holidays(2019) call (result discarded) was removed.
    data = datetime.now().date()
    # Record whether today is already a working day.
    verifica_dia = cal.is_working_day(data)
    # Walk forward one day at a time until a working day is found.
    while not cal.is_working_day(data):
        data = datetime.fromordinal(data.toordinal() + 1)
    # strftime zero-pads day and month exactly like the original manual
    # '0{}' padding did.
    datafinal = data.strftime('%d/%m/%Y')
    return datafinal, verifica_dia
from workalendar.asia import SouthKorea country_hols['Korea'] = SouthKorea() # from workalendar.asia import India # country_hols['India'] = India() # from workalendar.asia import Thailand # country_hols['Thailand'] = Thailand() # from workalendar.asia import Vietnam # country_hols['Vietnam'] = Vietnam() # from workalendar.asia import Indonesia # country_hols['Indonesia'] = Indonesia() from workalendar.oceania import Australia country_hols['Australia'] = Australia() from workalendar.america import Brazil country_hols['Brazil'] = Brazil() from workalendar.america import Canada country_hols['Canada'] = Canada() from workalendar.america import Mexico country_hols['Mexico'] = Mexico() def add_events(countries, years, my_hols): # the effect of events that change countries (Olympics, ...) is very hard to model # as the interaction impact show at different languages (ie different time series) # each time the country changes and it is very hard to move this impact across languages for ev_name, ev_list in evh.events.items(): for c, drg in ev_list: # country, dates if c in countries: for d in drg: if d.year in years:
def brazil_all_holidays_set(year):
    """Return all Brazilian holidays for *year* with their type and coverage.

    Collects national ('N'), bank ('B' type), state ('E') and municipal
    ('M') holidays into BrazilianHoliday objects, skipping duplicates by
    name (with coverage-aware checks for state/municipal entries).
    """
    holidays_set = []

    # National holidays: commemorative ('C') vs. official ('F') type.
    for dia_feriado, nome_feriado in Brazil().holidays(year):
        tipo = 'C' if nome_feriado in COMMEMORATIVE_HOLIDAYS else 'F'
        novo = BrazilianHoliday(nome_feriado, dia_feriado, None, None, 'N', tipo)
        if all(x.nome != novo.nome for x in holidays_set):
            holidays_set.append(novo)

    # Brazilian bank holidays (type 'B').
    for dia_feriado, nome_feriado in BrazilBankCalendar().holidays(year):
        novo = BrazilianHoliday(nome_feriado, dia_feriado, None, None, 'N', 'B')
        if all(x.nome != novo.nome for x in holidays_set):
            holidays_set.append(novo)

    # State holidays: IBGE register keys whose numeric code has 2 digits.
    for chave in IBGE_REGISTER:
        codigo = re.sub("BR-IBGE-", "", chave)
        if len(codigo) != 2:
            continue
        for dia_feriado, nome_feriado in IBGE_REGISTER[chave]().holidays(year):
            novo = BrazilianHoliday(nome_feriado, dia_feriado, codigo, None,
                                    'E', 'F')
            # Keep only holidays not already registered without a state.
            if not any(x.nome == novo.nome and not x.estado_ibge
                       for x in holidays_set):
                holidays_set.append(novo)

    # Municipal holidays: keys whose numeric code is longer than 2 digits;
    # the first two digits identify the state.
    for chave in IBGE_REGISTER:
        municipio = re.sub("BR-IBGE-", "", chave)
        if len(municipio) <= 2:
            continue
        for dia_feriado, nome_feriado in IBGE_REGISTER[chave]().holidays(year):
            novo = BrazilianHoliday(nome_feriado, dia_feriado, municipio[0:2],
                                    municipio, 'M', 'F')
            # Keep only holidays not already registered without a municipality.
            if not any(x.nome == novo.nome and not x.municipio_ibge
                       for x in holidays_set):
                holidays_set.append(novo)

    return holidays_set
def create_features(self):
    """Build per-card aggregate features from historical transactions.

    Loads historical_transactions.feather, engineers row-level features
    (datetime parts, Brazilian-holiday flag, distances to retail events,
    price, month_diff, NMF factor matrices), aggregates them by card_id
    with a 'hist_' prefix, joins the NMF matrices and stores the
    train/test feature frames on self.train and self.test.
    """
    #hist_df = pd.read_csv('../input/historical_transactions.csv', nrows=num_rows)
    hist_df = feather.read_dataframe(
        '../data/input/historical_transactions.feather')

    # fillna
    hist_df['category_2'].fillna(6.0, inplace=True)
    hist_df['category_3'].fillna('D', inplace=True)
    #hist_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    #hist_df['installments'].replace(-1, np.nan,inplace=True)
    #hist_df['installments'].replace(999, np.nan,inplace=True)

    # Convert the Y/N flag columns to 1/0 integers.
    hist_df['authorized_flag'] = hist_df['authorized_flag'].map({
        'Y': 1,
        'N': 0
    }).astype(int)
    hist_df['category_1'] = hist_df['category_1'].map({
        'Y': 1,
        'N': 0
    }).astype(int)
    #hist_df['category_3'] = hist_df['category_3'].map({'A':0, 'B':1, 'C':2, 'D':3})

    # SVD: NMF factor matrices of card_id vs. each categorical column.
    nmf_df_1 = calc_nmf_mat(hist_df, 'card_id', 'city_id')
    nmf_df_2 = calc_nmf_mat(hist_df, 'card_id', 'state_id')
    nmf_df_3 = calc_nmf_mat(hist_df, 'card_id', 'subsector_id')
    nmf_df_4 = calc_nmf_mat(hist_df, 'card_id', 'merchant_category_id')
    nmf_df_5 = calc_nmf_mat(hist_df, 'card_id', 'installments')
    nmf_df_6 = calc_nmf_mat(hist_df, 'card_id', 'category_3')
    nmf_df_7 = calc_nmf_mat(hist_df, 'card_id', 'category_2')

    # datetime features
    hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
    # hist_df['year'] = hist_df['purchase_date'].dt.year
    hist_df['month'] = hist_df['purchase_date'].dt.month
    hist_df['day'] = hist_df['purchase_date'].dt.day
    hist_df['hour'] = hist_df['purchase_date'].dt.hour
    hist_df['weekofyear'] = hist_df['purchase_date'].dt.weekofyear
    hist_df['weekday'] = hist_df['purchase_date'].dt.weekday
    hist_df['weekend'] = (hist_df['purchase_date'].dt.weekday >= 5).astype(int)

    # additional features
    hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']

    # Brazilian holiday flag (was: ブラジルの休日).
    cal = Brazil()
    hist_df['is_holiday'] = hist_df['purchase_date'].dt.date.apply(
        cal.is_holiday).astype(int)

    # Days from the purchase date up to each retail event; values outside
    # the open interval (0, 100) become 0.
    #Christmas : December 25 2017
    hist_df['Christmas_Day_2017'] = (
        pd.to_datetime('2017-12-25') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Mothers Day: May 14 2017
    # NOTE(review): the comment says May 14 but the constant is 2017-06-04 —
    # confirm which date was intended.
    hist_df['Mothers_Day_2017'] = (
        pd.to_datetime('2017-06-04') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #fathers day: August 13 2017
    hist_df['fathers_day_2017'] = (
        pd.to_datetime('2017-08-13') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    hist_df['Children_day_2017'] = (
        pd.to_datetime('2017-10-12') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Valentine's Day : 12th June, 2017
    hist_df['Valentine_Day_2017'] = (
        pd.to_datetime('2017-06-12') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    hist_df['Black_Friday_2017'] = (
        pd.to_datetime('2017-11-24') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #2018
    #Mothers Day: May 13 2018
    hist_df['Mothers_Day_2018'] = (
        pd.to_datetime('2018-05-13') - hist_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    # Approximate months elapsed since the purchase, shifted by month_lag.
    hist_df['month_diff'] = ((datetime.datetime.today() -
                              hist_df['purchase_date']).dt.days) // 30
    hist_df['month_diff'] += hist_df['month_lag']

    # additional features
    hist_df['duration'] = hist_df['purchase_amount'] * hist_df['month_diff']
    hist_df['amount_month_ratio'] = hist_df['purchase_amount'] / hist_df[
        'month_diff']

    # Memory-usage reduction disabled: it raised an unexplained error here
    # (pyarrow.lib.ArrowNotImplementedError: halffloat).
    #hist_df = reduce_mem_usage(hist_df)

    # Per-card aggregation spec.
    col_unique = ['subsector_id', 'merchant_id', 'merchant_category_id']
    col_seas = ['month', 'hour', 'weekofyear', 'day']
    aggs = {}
    for col in col_unique:
        aggs[col] = ['nunique']
    for col in col_seas:
        aggs[col] = ['nunique', 'mean', 'min', 'max']
    aggs['purchase_amount'] = ['sum', 'max', 'min', 'mean', 'var', 'skew']
    #aggs['installments'] = ['sum','max','mean','var','skew']
    aggs['purchase_date'] = ['max', 'min']
    aggs['month_lag'] = ['max', 'min', 'mean', 'var', 'skew']
    aggs['month_diff'] = ['max', 'min', 'mean', 'var', 'skew']
    aggs['authorized_flag'] = ['mean']
    aggs['weekend'] = ['mean', 'max']
    aggs['weekday'] = ['nunique', 'mean']
    # overwrite
    aggs['category_1'] = ['mean']
    #aggs['category_2'] = ['mean']
    #aggs['category_3'] = ['mean']
    aggs['card_id'] = ['size', 'count']
    aggs['is_holiday'] = ['mean']
    aggs['price'] = ['sum', 'mean', 'max', 'min', 'var', 'skew']
    aggs['Christmas_Day_2017'] = ['mean']
    aggs['Mothers_Day_2017'] = ['mean']
    aggs['fathers_day_2017'] = ['mean']
    aggs['Children_day_2017'] = ['mean']
    aggs['Valentine_Day_2017'] = ['mean']
    aggs['Black_Friday_2017'] = ['mean']
    aggs['Mothers_Day_2018'] = ['mean']
    aggs['duration'] = ['mean', 'min', 'max', 'var', 'skew']
    aggs['amount_month_ratio'] = ['mean', 'min', 'max', 'var', 'skew']
    """
    for col in ['category_2','category_3']:
        hist_df[col+'_mean'] = hist_df.groupby([col])['purchase_amount'].transform('mean')
        hist_df[col+'_min'] = hist_df.groupby([col])['purchase_amount'].transform('min')
        hist_df[col+'_max'] = hist_df.groupby([col])['purchase_amount'].transform('max')
        hist_df[col+'_sum'] = hist_df.groupby([col])['purchase_amount'].transform('sum')
        aggs[col+'_mean'] = ['mean']
    """
    hist_df = hist_df.reset_index().groupby('card_id').agg(aggs)

    # Flatten the MultiIndex columns and prefix them with 'hist_'.
    hist_df.columns = pd.Index(
        [e[0] + "_" + e[1] for e in hist_df.columns.tolist()])
    hist_df.columns = ['hist_' + c for c in hist_df.columns]

    # Date-span features per card.
    hist_df['hist_purchase_date_diff'] = (
        hist_df['hist_purchase_date_max'] -
        hist_df['hist_purchase_date_min']).dt.days
    hist_df['hist_purchase_date_average'] = hist_df[
        'hist_purchase_date_diff'] / hist_df['hist_card_id_size']
    hist_df['hist_purchase_date_uptonow'] = (
        datetime.datetime.today() -
        hist_df['hist_purchase_date_max']).dt.days
    hist_df['hist_purchase_date_uptomin'] = (
        datetime.datetime.today() -
        hist_df['hist_purchase_date_min']).dt.days

    # Join the NMF factor matrices.
    # NOTE(review): hist_df is indexed by card_id after the groupby; these
    # merges on the 'card_id' key assume the nmf frames carry that column
    # (the reset_index below is commented out) — confirm against
    # calc_nmf_mat's output.
    hist_df = pd.merge(hist_df, nmf_df_1, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_2, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_3, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_4, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_5, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_6, on=['card_id'], how='left')
    hist_df = pd.merge(hist_df, nmf_df_7, on=['card_id'], how='left')
    #hist_df = hist_df.reset_index()

    # Split back into train/test rows via the target column.
    train_df = feather.read_dataframe(
        '../features/traintest_train.feather')
    test_df = feather.read_dataframe('../features/traintest_test.feather')
    df = pd.concat([train_df, test_df], axis=0)
    init_cols = df.columns
    hist_df = pd.merge(df, hist_df, on='card_id', how='outer')
    hist_df_train = hist_df[hist_df['target'].notnull()]
    hist_df_test = hist_df[hist_df['target'].isnull()]
    hist_df_train = hist_df_train.drop(init_cols, axis=1)
    hist_df_test = hist_df_test.drop(init_cols, axis=1)
    #self.df = hist_df
    self.train = hist_df_train.reset_index(drop=True)
    self.test = hist_df_test.reset_index(drop=True)
def create_features(self):
    """Build per-card aggregate features from new-merchant transactions.

    Mirrors the historical-transactions pipeline: loads
    new_merchant_transactions.feather, engineers row-level features
    (datetime parts, distances to retail events, price, month_diff),
    aggregates them by card_id with a 'new_' prefix and stores the
    train/test feature frames on self.train and self.test.
    """
    # load csv
    #new_merchant_df = pd.read_csv('../input/new_merchant_transactions.csv', nrows=num_rows)
    new_merchant_df = feather.read_dataframe(
        '../data/input/new_merchant_transactions.feather')

    # fillna
    new_merchant_df['category_2'].fillna(1.0, inplace=True)
    new_merchant_df['category_3'].fillna('A', inplace=True)
    new_merchant_df['merchant_id'].fillna('M_ID_00a6ca8a8a', inplace=True)
    new_merchant_df['installments'].replace(-1, np.nan, inplace=True)
    new_merchant_df['installments'].replace(999, np.nan, inplace=True)

    # Convert the Y/N flag columns to 1/0 integers.
    new_merchant_df['authorized_flag'] = new_merchant_df[
        'authorized_flag'].map({
            'Y': 1,
            'N': 0
        }).astype(int)
    new_merchant_df['category_1'] = new_merchant_df['category_1'].map({
        'Y': 1,
        'N': 0
    }).astype(int)
    new_merchant_df['category_3'] = new_merchant_df['category_3'].map({
        'A': 0,
        'B': 1,
        'C': 2
    }).astype(int)

    # datetime features
    new_merchant_df['purchase_date'] = pd.to_datetime(
        new_merchant_df['purchase_date'])
    # new_merchant_df['year'] = new_merchant_df['purchase_date'].dt.year
    new_merchant_df['month'] = new_merchant_df['purchase_date'].dt.month
    new_merchant_df['day'] = new_merchant_df['purchase_date'].dt.day
    new_merchant_df['hour'] = new_merchant_df['purchase_date'].dt.hour
    new_merchant_df['weekofyear'] = new_merchant_df[
        'purchase_date'].dt.weekofyear
    new_merchant_df['weekday'] = new_merchant_df[
        'purchase_date'].dt.weekday
    new_merchant_df['weekend'] = (
        new_merchant_df['purchase_date'].dt.weekday >= 5).astype(int)

    # additional features
    new_merchant_df['price'] = new_merchant_df[
        'purchase_amount'] / new_merchant_df['installments']

    # Brazilian holiday calendar (was: ブラジルの休日); the is_holiday
    # feature itself is disabled below.
    cal = Brazil()
    # new_merchant_df['is_holiday'] = new_merchant_df['purchase_date'].dt.date.apply(cal.is_holiday).astype(int)

    # Days from the purchase date up to each retail event; values outside
    # the open interval (0, 100) become 0.
    #Christmas : December 25 2017
    new_merchant_df['Christmas_Day_2017'] = (
        pd.to_datetime('2017-12-25') - new_merchant_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Mothers Day: May 14 2017
    # new_merchant_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #fathers day: August 13 2017
    new_merchant_df['fathers_day_2017'] = (
        pd.to_datetime('2017-08-13') - new_merchant_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    new_merchant_df['Children_day_2017'] = (
        pd.to_datetime('2017-10-12') - new_merchant_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Valentine's Day : 12th June, 2017
    # new_merchant_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    new_merchant_df['Black_Friday_2017'] = (
        pd.to_datetime('2017-11-24') - new_merchant_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #2018
    #Mothers Day: May 13 2018
    new_merchant_df['Mothers_Day_2018'] = (
        pd.to_datetime('2018-05-13') - new_merchant_df['purchase_date']
    ).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    # Approximate months elapsed since the purchase, shifted by month_lag.
    new_merchant_df['month_diff'] = (
        (datetime.datetime.today() -
         new_merchant_df['purchase_date']).dt.days) // 30
    new_merchant_df['month_diff'] += new_merchant_df['month_lag']

    # additional features
    new_merchant_df['duration'] = new_merchant_df[
        'purchase_amount'] * new_merchant_df['month_diff']
    new_merchant_df['amount_month_ratio'] = new_merchant_df[
        'purchase_amount'] / new_merchant_df['month_diff']

    # Memory-usage reduction disabled (see the historical pipeline).
    #new_merchant_df = reduce_mem_usage(new_merchant_df)

    # Per-card aggregation spec.
    col_unique = ['subsector_id', 'merchant_id', 'merchant_category_id']
    col_seas = ['hour', 'weekofyear', 'weekday', 'day']
    aggs = {}
    for col in col_unique:
        aggs[col] = ['nunique']
    for col in col_seas:
        aggs[col] = ['nunique', 'mean', 'min', 'max']
    aggs['purchase_amount'] = ['sum', 'max', 'min', 'mean', 'var', 'skew']
    aggs['installments'] = ['sum', 'max', 'mean', 'var', 'skew']
    aggs['purchase_date'] = ['max', 'min']
    aggs['month_lag'] = ['max', 'min', 'mean', 'var', 'skew']
    aggs['month_diff'] = ['mean', 'var', 'skew']
    # aggs['authorized_flag'] = ['mean']
    aggs['weekend'] = ['mean']
    aggs['month'] = ['mean', 'min', 'max']
    aggs['category_1'] = ['mean', 'min']
    aggs['category_2'] = ['mean', 'min']
    aggs['category_3'] = ['mean', 'min']
    aggs['card_id'] = ['size', 'count']
    # aggs['is_holiday'] = ['mean']
    aggs['price'] = ['mean', 'max', 'min', 'var', 'skew']
    aggs['Christmas_Day_2017'] = ['mean']
    # aggs['Mothers_Day_2017'] = ['mean']
    aggs['fathers_day_2017'] = ['mean']
    aggs['Children_day_2017'] = ['mean']
    # aggs['Valentine_Day_2017'] = ['mean']
    aggs['Black_Friday_2017'] = ['mean']
    aggs['Mothers_Day_2018'] = ['mean']
    aggs['duration'] = ['mean', 'min', 'max', 'var', 'skew']
    aggs['amount_month_ratio'] = ['mean', 'min', 'max', 'var', 'skew']

    # Per-category purchase_amount statistics broadcast to the rows, then
    # averaged per card by the aggregation below.
    for col in ['category_2', 'category_3']:
        new_merchant_df[col + '_mean'] = new_merchant_df.groupby(
            [col])['purchase_amount'].transform('mean')
        new_merchant_df[col + '_min'] = new_merchant_df.groupby(
            [col])['purchase_amount'].transform('min')
        new_merchant_df[col + '_max'] = new_merchant_df.groupby(
            [col])['purchase_amount'].transform('max')
        new_merchant_df[col + '_sum'] = new_merchant_df.groupby(
            [col])['purchase_amount'].transform('sum')
        aggs[col + '_mean'] = ['mean']

    new_merchant_df = new_merchant_df.reset_index().groupby('card_id').agg(
        aggs)

    # Flatten the MultiIndex columns and prefix them with 'new_'.
    new_merchant_df.columns = pd.Index(
        [e[0] + "_" + e[1] for e in new_merchant_df.columns.tolist()])
    new_merchant_df.columns = ['new_' + c for c in new_merchant_df.columns]

    # Date-span features per card.
    new_merchant_df['new_purchase_date_diff'] = (
        new_merchant_df['new_purchase_date_max'] -
        new_merchant_df['new_purchase_date_min']).dt.days
    new_merchant_df['new_purchase_date_average'] = new_merchant_df[
        'new_purchase_date_diff'] / new_merchant_df['new_card_id_size']
    new_merchant_df['new_purchase_date_uptonow'] = (
        datetime.datetime.today() -
        new_merchant_df['new_purchase_date_max']).dt.days
    new_merchant_df['new_purchase_date_uptomin'] = (
        datetime.datetime.today() -
        new_merchant_df['new_purchase_date_min']).dt.days

    # Memory-usage reduction disabled.
    #new_merchant_df = reduce_mem_usage(new_merchant_df)
    #new_merchant_df = new_merchant_df.reset_index()

    # Split back into train/test rows via the target column.
    train_df = feather.read_dataframe(
        '../features/traintest_train.feather')
    test_df = feather.read_dataframe('../features/traintest_test.feather')
    df = pd.concat([train_df, test_df], axis=0)
    init_cols = df.columns
    new_merchant_df = pd.merge(df,
                               new_merchant_df,
                               on='card_id',
                               how='outer')
    new_merchant_df_train = new_merchant_df[
        new_merchant_df['target'].notnull()]
    new_merchant_df_test = new_merchant_df[
        new_merchant_df['target'].isnull()]
    new_merchant_df_train = new_merchant_df_train.drop(init_cols, axis=1)
    new_merchant_df_test = new_merchant_df_test.drop(init_cols, axis=1)
    self.train = new_merchant_df_train.reset_index(drop=True)
    self.test = new_merchant_df_test.reset_index(drop=True)
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 7 16:37:25 2019

@author: Ramon.Nascimento
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime, timezone
from workalendar.america import Brazil

# Brazilian working-day/holiday calendar.
cal = Brazil()

dados = pd.read_csv('./transactions.csv')

# Parse the transaction date (dd/mm/yyyy).
dados['data'] = pd.to_datetime(dados['data'], format='%d/%m/%Y')

# BUGFIX: the original used attribute-style assignment (dados.dia = ...),
# which does NOT create new DataFrame columns — it only sets instance
# attributes (pandas emits a UserWarning). Bracket assignment creates the
# actual columns.
dados['dia'] = dados['data'].dt.day
dados['mes'] = dados['data'].dt.month
dados['ano'] = dados['data'].dt.year
dados['diaAno'] = dados['data'].dt.dayofyear

# Exploratory expressions: no-ops when run as a script, useful in a
# REPL/notebook (kept from the original).
dados.dtypes
dados.corr().round(4)
def update_db(db_dir: str = r'investments_database.db'):
    """Update the investments database in place.

    Pulls new CVM daily reports, fund cadastral information, selic rates
    and ibovespa prices published since the last logged update, deletes
    the overlapping rows and appends the fresh data to the SQLite file.

    Parameters:
        db_dir (str): path of the database file to be updated.
            Defaults to 'investments_database.db'.

    Returns:
        None.
    """
    ##STEP 1
    #connects to the database
    print(f'connected with the database {db_dir}\n')
    con = sqlite3.connect(db_dir)

    ##STEP 2
    #calculates relevant date limits to the update process
    Cal = Brazil()  #initializes the brazilian calendar
    today = datetime.date.today()
    #queries the last update from the log table
    last_update = pd.to_datetime(
        pd.read_sql('select MAX(date) from update_log', con).iloc[0, 0])
    #date of the last published cvm report (2 working days before the update)
    last_quota = Cal.sub_working_days(last_update, 2)
    num_months = (today.year - last_quota.year) * 12 + (
        today.month - last_quota.month) + 1

    ##STEP 3
    #delete information that will be updated from the database tables
    #(the unused 'tables' dict of the original was removed — the deletes
    #below are the actual statements)
    print('deleting redundant data from the database... \n')
    cursor = con.cursor()
    cursor.execute('delete from daily_quotas where DT_COMPTC >= :date',
                   {'date': last_quota.strftime("%Y-%m-01")})
    cursor.execute('delete from ibov_returns where Date >= :date',
                   {'date': last_update.strftime("%Y-%m-%d")})
    con.commit()
    cursor.close()

    ##STEP 4
    #Pulls new data from CVM, investpy and the brazilian central bank
    #and pushes it to the database
    try:  #tries to read target funds if they were specified when starting the database
        target_funds = pd.read_sql('select targets from target_funds',
                                   con).targets.to_list()
    except DatabaseError:
        target_funds = []

    print('downloading new daily reports from the CVM website...\n')
    #downloads the daily cvm report for each month between the last update and today
    #NOTE(review): range(num_months + 1) re-downloads one extra month past
    #num_months (which already carries a +1) — presumably deliberate overlap.
    for m in range(num_months + 1):
        data_alvo = last_quota + relativedelta(months=+m)
        informe = cvm_informes(data_alvo.year, data_alvo.month)
        if target_funds:
            informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
        try:
            informe.to_sql('daily_quotas', con, if_exists='append',
                           index=False)
        except AttributeError:
            #no report available for this month (informe is not a DataFrame)
            pass

    #downloads cadastral information from CVM of the fundos and pushes it to the database
    print('downloading updated cadastral information from cvm...\n')
    info_cad = pd.read_csv(
        'http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv',
        sep=';',
        encoding='latin1',
        dtype={'RENTAB_FUNDO': object, 'FUNDO_EXCLUSIVO': object,
               'TRIB_LPRAZO': object, 'ENTID_INVEST': object,
               'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object,
               'DIRETOR': object, 'CNPJ_CONTROLADOR': object,
               'CONTROLADOR': object})
    if target_funds:
        #filters target funds if they were specified when building the database.
        info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)]
    info_cad.to_sql('info_cadastral_funds', con, if_exists='replace',
                    index=False)

    #updates daily interest returns (selic)
    print('updating selic rates...\n')
    selic = pd.read_json(
        'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'
        .format(11))
    selic['data'] = pd.to_datetime(selic['data'], format='%d/%m/%Y')
    selic['valor'] = selic['valor'] / 100  #calculates decimal rate from the percentual value
    #calculates asset "price" considering day 0 price as 1
    selic.loc[0, 'price'] = 1 * (1 + selic.loc[0, 'valor'])
    for i in range(1, len(selic)):
        selic.loc[i, 'price'] = selic.loc[i - 1, 'price'] * (
            1 + selic.loc[i, 'valor'])
    selic.rename(columns={'data': 'date', 'valor': 'rate'}, inplace=True)
    #filters only new data
    selic = selic[selic.date >= (last_update + datetime.timedelta(-1))]
    selic.to_sql('selic_rates', con, if_exists='append', index=False)

    #updates ibovespa data
    print('updating ibovespa returns...\n')
    today = (datetime.date.today() +
             datetime.timedelta(1)).strftime('%Y-%m-%d')
    ibov = pd.DataFrame(
        YahooFinancials('^BVSP').get_historical_price_data(
            last_update.strftime('%Y-%m-%d'), today,
            'daily')['^BVSP']['prices'])
    ibov = ibov.drop(columns=['date', 'close']).rename(
        columns={'formatted_date': 'date',
                 'adjclose': 'close'}).iloc[:, [5, 0, 1, 2, 3, 4]]
    ibov['date'] = pd.to_datetime(ibov['date'])
    #capitalizes columns to keep consistency with previous format (investpy)
    ibov.columns = [i.capitalize() for i in ibov.columns]
    ibov.to_sql('ibov_returns', con, if_exists='append', index=False)

    ##STEP 5
    #updates the log in the database
    print('updating the log...\n')
    update_log = pd.DataFrame({'date': [datetime.datetime.now()],
                               'log': [1]})
    update_log.to_sql('update_log', con, if_exists='append', index=False)

    ##STEP 6
    #closes the connection with the database
    con.close()
    print('connection with the database closed!\n')
    print(f'database {db_dir} updated!\n')