def clean_google_fit_summary():
    # ===== Config =====
    gfit_file = r'\Takeout\Fit\Daily Aggregations\Daily Summaries.csv'
    fit_agg_path = root_dir + gfit_file
    # fit_agg_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\Takeout\Fit\Daily Aggregations\Daily Summaries.csv'

    # ===== Google Fit aggregate collection cleaning =====
    cols = [
        'Date', 'Calories (kcal)', 'Step count', 'Cycling duration (ms)',
        'Inactive duration (ms)', 'Walking duration (ms)',
        'Running duration (ms)'
    ]
    g_fit = pd.read_csv(fit_agg_path, usecols=cols)

    # Fixing date
    g_fit['Date'] = pd.to_datetime(g_fit['Date'])

    # Reversing the index so the oldest rows come first
    g_fit = g_fit.reindex(index=g_fit.index[::-1])
    g_fit = g_fit.reset_index(drop=True)

    # === Filling empty values with 0 ===
    g_fit = g_fit.fillna(value=0, axis='columns')

    # === Fixing column dtypes ===
    # NOTE: dtype_map is currently unused; the explicit casts below do the work.
    dtype_map = {
        'Calories (kcal)': float,
        'Step count': int,
        'Inactive duration (ms)': int,
        'Cycling duration (ms)': int,
        'Walking duration (ms)': int,
        'Running duration (ms)': int
    }
    # g_fit['Calories (kcal)'] = g_fit['Calories (kcal)'].astype(float)
    g_fit['Step count'] = g_fit['Step count'].astype(np.int64)
    g_fit['Inactive duration (ms)'] = g_fit['Inactive duration (ms)'].astype(np.int64)
    g_fit['Cycling duration (ms)'] = g_fit['Cycling duration (ms)'].astype(np.int64)
    g_fit['Walking duration (ms)'] = g_fit['Walking duration (ms)'].astype(np.int64)
    g_fit['Running duration (ms)'] = g_fit['Running duration (ms)'].astype(np.int64)

    # Dropping all the dates already in the database
    tbl_name = daily_config['google_fit']['tbl_name']
    g_fit = cutil.get_only_new_data_df(g_fit, tbl_name)

    # ===== Google Fit saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates if get_only_new_data_df misses anything.
    conn = sqlite3.connect('selfdata_01.db')
    g_fit.to_sql('google_fit', conn, if_exists='append')
    conn.close()
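
# All the cleaners in this module lean on cutil.get_only_new_data_df to drop
# rows whose dates are already stored. The real implementation lives in the
# cutil module; the sketch below only illustrates how such a helper could
# work, assuming every table has a 'Date' column stored as ISO text (the
# underscore-prefixed name marks it as hypothetical, not the actual helper).
def _get_only_new_data_df_sketch(df, tbl_name, db='selfdata_01.db'):
    """Keep only rows whose Date is newer than the latest one in tbl_name."""
    conn = sqlite3.connect(db)
    try:
        last = pd.read_sql(f'SELECT MAX(Date) AS d FROM "{tbl_name}"',
                           conn)['d'][0]
    except pd.io.sql.DatabaseError:
        last = None  # table does not exist yet, so everything is new
    finally:
        conn.close()
    if last is None:
        return df
    return df[df['Date'] > pd.to_datetime(last)].reset_index(drop=True)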
def clean_pomodoro_excel():
    # ===== Config =====
    excel_pomodoro_path = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro'

    # ===== Excel collection of multiple Excel files =====
    excel_pomodoro_total_daily = pd.DataFrame(columns=['Date', 'pomo_total'])
    excel_pomo = pd.DataFrame(columns=['Date', 'subject', 'subject_none'])
    excel_noo = pd.DataFrame(columns=['Date', 'nootropic'])

    # = For dev purposes only =
    # excel_single_file_last = r'11-13.xlsx'
    # excel_single_file_first = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\04-11.xlsx'
    # excel_single_file_first = r'04-11.xlsx'
    excel_single_file_custom = r'05-24.xlsx'

    # ===== Reading and processing the Excel data for the SQL DB =====
    # for file in [excel_single_file_custom]:  # dev: read a single file
    for file in os.listdir(excel_pomodoro_path):
        try:
            print('file reading: {} \n'.format(file))
            xl = pd.ExcelFile(excel_pomodoro_path + "\\" + file)
            # 10-12.xlsx is the first file with a nootropic column.
            df_day = xl.parse(
                "Sheet1",
                names=['time', 'subject', 'subject_none', 'nootropic'],
                parse_cols="B,C,D,E")

            # Older sheets have no nootropic column.
            if df_day['nootropic'].iloc[0] != 'Nootropics':
                df_day = df_day.drop('nootropic', axis=1)

            # Drop empty rows picked up at the bottom of the sheet and the
            # Excel header label at the top; a row with no time has no data.
            df_day = df_day.dropna(axis=0, how='any', subset=['time'])
            df_day = df_day[df_day['time'] != 'Time']
            df_day = df_day.reset_index(drop=True)

            # 1. excel_pomodoro_total_daily
            # Simplified for now: total pomodoros per day rather than a
            # detailed per-day breakdown.
            df_day_temp = df_day.dropna(axis=0, how='any', subset=['subject'])
            # For now the file name stands in for the date.
            day_sum = {'Date': [file], 'pomo_total': [len(df_day_temp)]}
            df_day_sum = pd.DataFrame(day_sum, columns=['Date', 'pomo_total'])
            excel_pomodoro_total_daily = excel_pomodoro_total_daily.append(
                df_day_sum)

            # 2. excel_pomo, excel_noo
            # Build the Date column from the file name (month-day) and the
            # parent folder name (year). TODO: derive the year more robustly.
            cur_month_day = file.replace('.xlsx', '')
            cur_year = os.path.basename(os.path.dirname(excel_pomodoro_path))
            cur_date = f'{cur_year}-{cur_month_day} '
            df_day['time'] = df_day['time'].astype(str)
            df_day['Date'] = cur_date + df_day['time']
            # record_date keeps the sheet's calendar day, so SQL filters for
            # "today's pomodoros" still work after the past-midnight fix below.
            df_day['record_date'] = cur_date
            try:
                df_day['Date'] = pd.to_datetime(df_day['Date'],
                                                format="%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Times past midnight belong to the next calendar day. Note
                # that every post-midnight row is stamped with the next day's
                # midnight; the actual times are not preserved.
                next_day = False
                for index, row in df_day.iterrows():
                    if next_day is False:
                        if row['time'] == '1900-01-01 00:00:00':
                            next_day = True
                            cur_date_datetime = datetime.strptime(
                                cur_date, "%Y-%m-%d ")
                            tmr_date_datetime = cur_date_datetime + timedelta(
                                days=1)
                            tmr_date_str = tmr_date_datetime.strftime(
                                "%Y-%m-%d %H:%M:%S")
                            df_day.loc[index, 'Date'] = tmr_date_str
                            continue
                        # Assign via .loc; writing to `row` only changes a copy.
                        df_day.loc[index, 'Date'] = cur_date + row['time']
                    elif next_day is True:
                        cur_date_datetime = datetime.strptime(
                            cur_date, "%Y-%m-%d ")
                        tmr_date_datetime = cur_date_datetime + timedelta(
                            days=1)
                        tmr_date_str = tmr_date_datetime.strftime(
                            "%Y-%m-%d %H:%M:%S")
                        df_day.loc[index, 'Date'] = tmr_date_str
                df_day['Date'] = pd.to_datetime(df_day['Date'],
                                                format="%Y-%m-%d %H:%M:%S")

            # Prep for excel_pomo and append each day to the bigger frames.
            df_pomo = df_day[['Date', 'record_date', 'subject', 'subject_none']]
            excel_pomo = excel_pomo.append(df_pomo)

            if 'nootropic' in df_day.columns:
                df_noo = df_day[['Date', 'nootropic']]
                # Only keep times that have data.
                df_noo = df_noo.dropna(axis=0, how='any', subset=['nootropic'])
                excel_noo = excel_noo.append(df_noo)
        except IndexError:
            print("index error on file: {}".format(file))

    print("\nall files read")

    # For now assume every date is in 2017; later derive the year from the
    # folder name, probably in a different stage of the pipeline.
    excel_pomodoro_total_daily['pomo_total'] = excel_pomodoro_total_daily[
        'pomo_total'].astype(int)
    excel_pomodoro_total_daily['Date'] = excel_pomodoro_total_daily[
        'Date'].str.replace('.xlsx', '')
    excel_pomodoro_total_daily[
        'Date'] = "2017-" + excel_pomodoro_total_daily['Date']
    excel_pomodoro_total_daily['Date'] = pd.to_datetime(
        excel_pomodoro_total_daily['Date'], format="%Y-%m-%d")

    # TODO: IMPORTANT: when appending, the DataFrame index is no longer
    # unique. Consider SQLAlchemy. For now this is fine: the data is rebuilt
    # from scratch each run, so writing with if_exists='replace' below keeps
    # the table consistent.
    excel_pomo = excel_pomo.reset_index(drop=True)
    print(excel_pomo)

    # Dropping all the dates already in the database
    # TODO: DEL?
    # tbl_name_daily = daily_config['pomo_excel_daily']['tbl_name']
    # excel_pomodoro_total_daily = cutil.get_only_new_data_df(excel_pomodoro_total_daily, tbl_name_daily)
    tbl_name_pomo = daily_config['excel_pomodoro']['tbl_name']
    # Not needed with if_exists='replace':
    # excel_pomo = cutil.get_only_new_data_df(excel_pomo, tbl_name_pomo)
    tbl_name_noo = daily_config['excel_nootropic']['tbl_name']
    excel_noo = cutil.get_only_new_data_df(excel_noo, tbl_name_noo)

    # ===== Excel pomo saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates for the tables written with if_exists='append'.
    conn = sqlite3.connect('selfdata_01.db')
    # TODO: DEL?
    excel_pomodoro_total_daily.to_sql('pomo_excel_daily', conn,
                                      if_exists='append')
    excel_pomo.to_sql(tbl_name_pomo, conn, if_exists='replace')
    excel_noo.to_sql(tbl_name_noo, conn, if_exists='append')
    conn.close()
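
# The duplicate-row TODO above could also be solved at the database level
# instead of in pandas. A minimal sketch, assuming the target table declares
# a UNIQUE constraint on its date column (the schema assumption and helper
# name are illustrative, not part of the existing code):
def _append_without_duplicates_sketch(df, tbl_name, db='selfdata_01.db'):
    """Insert rows, silently skipping ones that violate a UNIQUE constraint."""
    conn = sqlite3.connect(db)
    cols = list(df.columns)
    col_list = ", ".join('"{}"'.format(c) for c in cols)
    placeholders = ", ".join("?" for _ in cols)
    sql = 'INSERT OR IGNORE INTO "{}" ({}) VALUES ({})'.format(
        tbl_name, col_list, placeholders)
    # Values are stringified for simplicity; real code would keep dtypes.
    rows = (tuple(str(v) for v in row) for row in df.itertuples(index=False))
    conn.executemany(sql, rows)
    conn.commit()
    conn.close()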
def clean_weather():
    # ===== Config =====
    weather_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\daily-text'

    # ===== Weather collection of multiple CSVs =====
    weather = pd.DataFrame()
    names = [
        'Time', 'Temp', 'Humid', 'DewPt', 'Press', 'WindSp', 'WindDr', 'Sun',
        'Rain', 'Start', 'MxWSpd'
    ]
    drop_row_range = list(range(8))

    # For dev purposes: run the whole thing on a smaller set of files.
    dev_smaller_files = os.listdir(weather_path)[-50:-1]

    # ===== Reading CSVs + weather collection cleaning =====
    for file in os.listdir(weather_path):
        # for file in dev_smaller_files:
        try:
            print('file reading: {} \n'.format(file))
            weather_monthly = pd.read_csv(weather_path + "\\" + file,
                                          sep='\t',
                                          names=names)
            weather_monthly = weather_monthly.drop(
                weather_monthly.index[drop_row_range])

            # The date is in the file name itself, so it has to be added here.
            weather_monthly['DateOnly'] = file
            weather_monthly['DateOnly'] = pd.to_datetime(
                weather_monthly['DateOnly'], format="%Y_%m_%d")
            # TODO: bug somewhere in the data conversion; something to do with
            # a certain file or the way it is processed.
            weather_monthly['Date'] = file + " " + weather_monthly['Time']
            weather_monthly['Date'] = pd.to_datetime(weather_monthly['Date'],
                                                     format="%Y_%m_%d %H:%M")
            weather_monthly['Temp'] = weather_monthly['Temp'].astype(
                np.float64)
            weather = weather.append(weather_monthly)
            print('\n\n')
        except IndexError:
            print("index error on file: {}".format(file))
            print("Might have no data for this date")
    print("all files read")

    # === Fixing column dtypes ===
    weather = weather.drop('Time', axis=1)
    weather['Humid'] = weather['Humid'].astype(np.int64)
    weather['DewPt'] = weather['DewPt'].astype(np.float64)
    weather['Press'] = weather['Press'].astype(np.int64)
    weather['WindSp'] = weather['WindSp'].astype(np.float64)
    weather['Sun'] = weather['Sun'].astype(np.float64)
    weather['Rain'] = weather['Rain'].astype(np.float64)
    weather['MxWSpd'] = weather['MxWSpd'].astype(np.int64)

    # == Column ordering, date columns first ==
    columns_order = [
        'DateOnly', 'Date', 'Temp', 'Humid', 'DewPt', 'Press', 'WindSp',
        'WindDr', 'Sun', 'Rain', 'Start', 'MxWSpd'
    ]
    weather = weather[columns_order]

    # Sorting oldest-first for future use
    weather = weather.sort_values('Date', axis=0)
    weather = weather.reset_index(drop=True)

    # =========================
    # ====== Creating the daily database
    # =========================
    weather_hourly = weather.copy()
    weather_hourly = weather_hourly.groupby('DateOnly')
    weather_daily = weather_hourly.agg({
        'Temp': ['mean', 'min', 'max'],
        'Humid': 'mean',
        'Sun': 'max',
        'Rain': 'max'
    })

    # Flatten the two-level aggregate columns back into one level,
    # e.g. ('Temp', 'max') -> 'Temp_max'.
    weather_daily.columns = [
        "_".join(x) for x in weather_daily.columns.ravel()
    ]
    weather_daily = weather_daily.rename(columns={
        'Sun_max': 'Sun_total',
        'Rain_max': 'Rain_total'
    })
    weather_daily = weather_daily.round(1)
    weather_daily = weather_daily.reset_index()

    # Renaming DateOnly to Date
    weather_daily = weather_daily.rename(columns={'DateOnly': 'Date'})
    col_reorder = [
        'Date', 'Temp_mean', 'Temp_min', 'Temp_max', 'Humid_mean',
        'Sun_total', 'Rain_total'
    ]
    weather_daily = weather_daily[col_reorder]

    # Dropping all the dates already in the database
    tbl_name = daily_config['weather_daily']['tbl_name']
    weather_daily = cutil.get_only_new_data_df(weather_daily, tbl_name)

    # ===== Weather saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    # weather.to_sql('weather_hourly', conn)  # , if_exists='append'
    weather_daily.to_sql('weather_daily', conn, if_exists='append')
    conn.close()
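
# Quick illustration of the column flattening used in clean_weather above:
# aggregating with multiple functions produces a two-level column index,
# which "_".join collapses back into single names (toy data, not the real
# weather set):
def _flatten_agg_columns_demo():
    toy = pd.DataFrame({'DateOnly': ['2017-07-01', '2017-07-01'],
                        'Temp': [18.0, 24.0]})
    daily = toy.groupby('DateOnly').agg({'Temp': ['mean', 'min', 'max']})
    # ('Temp', 'mean') becomes 'Temp_mean', and so on.
    daily.columns = ["_".join(x) for x in daily.columns.ravel()]
    print(daily)  # columns: Temp_mean, Temp_min, Temp_max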
def clean_pomodoro_kanban():
    # ===== Config =====
    kanban_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\Today board.csv'

    # ===== Kanban collection cleaning =====
    # cols = ['Name', 'Color', 'Time spent', 'Labels', 'Comments', 'Grouping date', 'Created timestamp']
    cols = ['Time spent', 'Grouping date']
    dtype = {'Time spent': np.float64}
    kanban = pd.read_csv(kanban_path, usecols=cols, dtype=dtype)

    # Fixing date
    kanban['Date'] = pd.to_datetime(kanban['Grouping date'], format='%Y-%m-%d')
    kanban.drop(['Grouping date'], inplace=True, axis=1)

    # Finding total time spent per day
    kanban = kanban.groupby('Date')
    kanban = kanban.sum()
    kanban = kanban.reset_index()

    def dec_norm(time):
        """Convert decimal hours (e.g. 1.5) to total minutes (e.g. 90.0)."""
        hours = int(time)
        minutes = (time * 60) % 60
        return hours * 60 + minutes

    # Convert decimal time to minutes, then estimate pomodoros at 26 minutes
    # each, rounded to the nearest whole pomodoro.
    kanban['Total Min'] = kanban['Time spent'].apply(dec_norm)
    kanban['pomo_total'] = kanban['Total Min'] / 26
    kanban['pomo_total'] = kanban['pomo_total'].round(0)
    kanban = kanban.drop(['Time spent', 'Total Min'], axis=1)

    # Dropping all the dates already in the database
    tbl_name = daily_config['pomo_kanban_daily']['tbl_name']
    kanban = cutil.get_only_new_data_df(kanban, tbl_name)

    # ===== Kanban saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    kanban.to_sql('pomo_kanban_daily', conn, if_exists='append')
    conn.close()
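
# Standalone check of the decimal-hours conversion used in
# clean_pomodoro_kanban. The inner dec_norm is not reachable from outside
# that function, so the same arithmetic is repeated here for illustration:
def _dec_hours_to_minutes(time):
    hours = int(time)
    minutes = (time * 60) % 60
    return hours * 60 + minutes

# e.g. _dec_hours_to_minutes(2.6) == 156.0, and 156 / 26 == 6.0 pomodoros.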
def clean_weight():
    # ===== Config =====
    fitbit_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\weight'

    # ===== Weight collection of multiple CSVs =====
    weight = pd.DataFrame()
    names = ['Date', 'Weight', 'BMI', 'Fat']  # Not parsing dates yet

    # ===== Reading CSVs + weight collection cleaning =====
    for file in os.listdir(fitbit_path):
        weight_monthly = pd.read_csv(fitbit_path + "\\" + file,
                                     names=names,
                                     dtype={'Date': object})
        weight_monthly = weight_monthly.drop(weight_monthly.index[[0, 1]])
        try:
            weight_monthly['Date'] = pd.to_datetime(weight_monthly['Date'],
                                                    format="%d-%m-%Y")
        except ValueError:
            print(file)
            print('Problem parsing date in %d-%m-%Y format. '
                  'Trying the second format.')
            try:
                weight_monthly['Date'] = pd.to_datetime(
                    weight_monthly['Date'], format="%d/%m/%Y")
                print('Parsed date in %d/%m/%Y format instead.')
            except ValueError:
                print('DATE ISSUE: could not parse date in either format.')
        weight = weight.append(weight_monthly)

    # === Fixing column dtypes ===
    weight['Weight'] = weight['Weight'].astype(np.float64)
    weight['BMI'] = weight['BMI'].astype(np.float64)
    weight['Fat'] = weight['Fat'].astype(np.float64)

    # Sorting oldest-first for future use
    weight = weight.sort_values('Date', axis=0)
    weight = weight.reset_index(drop=True)

    # TODO: fill in missing dates with estimates; see the reindex sketch
    # after this function.

    # Dropping all the dates already in the database
    tbl_name = daily_config['weight']['tbl_name']
    weight = cutil.get_only_new_data_df(weight, tbl_name)

    # ===== Weight saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    weight.to_sql('weight', conn, if_exists='append')
    conn.close()
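
# Sketch for the "fill in missing dates" TODO in clean_weight: reindex over
# the full date range so days without a weigh-in appear as NaN, which
# interpolation could then estimate. Assumes one row per date, which holds
# after the sort above; this helper is an illustration and is not wired in.
def _fill_missing_dates_sketch(weight):
    all_dates = pd.date_range(weight['Date'].iloc[0], weight['Date'].iloc[-1])
    weight = weight.set_index('Date').reindex(all_dates)
    weight.index.name = 'Date'
    # weight = weight.interpolate()  # optional: estimate the gaps
    return weight.reset_index()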
def clean_daylio():
    # ===== Config =====
    daylio_file = r'\daylio_export.csv'
    path_daylio = root_dir + daylio_file
    # path_daylio = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\1_current\daylio_export.csv'

    # ===== Daylio collection cleaning =====
    daylio = pd.read_csv(path_daylio)

    # Fixing date
    daylio['year'] = daylio['year'].apply(str)
    daylio['date-str'] = daylio['date'] + ' ' + daylio['year'] + ' ' + daylio['time']
    daylio['date-datetime'] = pd.to_datetime(daylio['date-str'])

    # Dropping columns no longer required
    cols_drop_after_date = ['year', 'date', 'weekday', 'time', 'date-str']
    daylio = daylio.drop(cols_drop_after_date, axis=1)

    # Normalizing mood onto a 0-10 scale
    mood_map = {'awful': 0, 'fugly': 2.5, 'meh': 5, 'good': 7.5, 'rad': 10}
    daylio['mood'] = daylio['mood'].map(mood_map)

    # Reduce each timestamp to its calendar day
    daylio['Date'] = daylio['date-datetime'].dt.strftime('%Y-%m-%d')
    daylio = daylio.drop('date-datetime', axis=1)
    daylio = daylio[['Date', 'mood']]

    # Averaging all entries within a day.
    # WARNING: groupby().mean() silently drops non-numeric columns, but those
    # carry little data, so that is acceptable for now.
    daylio = daylio.groupby('Date')
    daylio = daylio.mean()
    daylio = daylio.reset_index()

    # Back to a datetime dtype
    daylio['Date'] = pd.to_datetime(daylio['Date'], format="%Y-%m-%d")

    # Dropping all the dates already in the database
    tbl_name = daily_config['mood']['tbl_name']
    daylio = cutil.get_only_new_data_df(daylio, tbl_name)

    # ===== Daylio saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    daylio.to_sql('mood', conn, if_exists='append')
    conn.close()
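
# Small illustration of the mood normalization plus daily averaging done in
# clean_daylio (toy rows, not real export data):
def _mood_average_demo():
    mood_map = {'awful': 0, 'fugly': 2.5, 'meh': 5, 'good': 7.5, 'rad': 10}
    toy = pd.DataFrame({'Date': ['2017-07-01', '2017-07-01', '2017-07-02'],
                        'mood': ['meh', 'rad', 'good']})
    toy['mood'] = toy['mood'].map(mood_map)
    # 2017-07-01 averages to 7.5 (mean of 5 and 10); 2017-07-02 stays 7.5.
    print(toy.groupby('Date').mean().reset_index())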
# NOTE: older, simplified cleaner that only computes daily totals. It
# originally shared the name clean_pomodoro_excel, which made it shadow the
# fuller version above; renamed here and kept for reference.
def clean_pomodoro_excel_daily():
    # ===== Config =====
    excel_pomodoro_path = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro'

    # ===== Excel collection of multiple Excel files =====
    excel_pomodoro_total_daily = pd.DataFrame(columns=['Date', 'pomo_total'])

    # = For dev purposes only =
    # excel_single_file_last = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\08-23.xlsx'
    # excel_single_file_first = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\04-11.xlsx'

    # ===== Reading the Excel files + cleaning =====
    # for file in [excel_single_file_last]:  # dev: read a single file
    for file in os.listdir(excel_pomodoro_path):
        try:
            print('file reading: {} \n'.format(file))
            xl = pd.ExcelFile(excel_pomodoro_path + "\\" + file)
            df_day = xl.parse("Sheet1",
                              names=['time', 'subject', 'subject_non'],
                              parse_cols="B,C,D")
            # A row with no time has no data; also drop the Excel header label.
            df_day = df_day.dropna(axis=0, how='any', subset=['time'])
            df_day = df_day[df_day['time'] != 'Time']
            df_day = df_day.reset_index(drop=True)

            # Simplified for now: total pomodoros per day rather than a
            # detailed per-day breakdown.
            df_day_temp = df_day.dropna(axis=0, how='any', subset=['subject'])
            # For now the file name stands in for the date.
            day_sum = {'Date': [file], 'pomo_total': [len(df_day_temp)]}
            df_day_sum = pd.DataFrame(day_sum, columns=['Date', 'pomo_total'])
            excel_pomodoro_total_daily = excel_pomodoro_total_daily.append(
                df_day_sum)
        except IndexError:
            print("index error on file: {}".format(file))
    print("all files read")

    # For now assume every date is in 2017; later derive the year from the
    # folder name, probably in a different stage of the pipeline.
    excel_pomodoro_total_daily['pomo_total'] = excel_pomodoro_total_daily[
        'pomo_total'].astype(int)
    excel_pomodoro_total_daily['Date'] = excel_pomodoro_total_daily[
        'Date'].str.replace('.xlsx', '')
    excel_pomodoro_total_daily[
        'Date'] = "2017-" + excel_pomodoro_total_daily['Date']
    excel_pomodoro_total_daily['Date'] = pd.to_datetime(
        excel_pomodoro_total_daily['Date'], format="%Y-%m-%d")

    # Dropping all the dates already in the database
    tbl_name = daily_config['pomo_excel_daily']['tbl_name']
    excel_pomodoro_total_daily = cutil.get_only_new_data_df(
        excel_pomodoro_total_daily, tbl_name)

    # ===== Excel pomo saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    excel_pomodoro_total_daily.to_sql('pomo_excel_daily', conn,
                                      if_exists='append')
    conn.close()
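
# A minimal driver sketch for running every cleaner in sequence. None of the
# functions take arguments and each manages its own sqlite connection, so
# order is not critical (this entry point is an assumption, not part of the
# original module):
if __name__ == '__main__':
    clean_google_fit_summary()
    clean_pomodoro_excel()
    clean_weather()
    clean_pomodoro_kanban()
    clean_weight()
    clean_daylio()
    clean_pomodoro_excel_daily()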