def clean_google_fit_summary():
    # ===== Config =====
    gfit_file = r'\Takeout\Fit\Daily Aggregations\Daily Summaries.csv'
    fit_agg_path = root_dir + gfit_file
    # fit_agg_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\Takeout\Fit\Daily Aggregations\Daily Summaries.csv'

    # ===== Google Fit aggregate collection cleaning =====
    cols = [
        'Date', 'Calories (kcal)', 'Step count', 'Cycling duration (ms)',
        'Inactive duration (ms)', 'Walking duration (ms)',
        'Running duration (ms)'
    ]
    g_fit = pd.read_csv(fit_agg_path, usecols=cols)

    # Fixing date
    g_fit['Date'] = pd.to_datetime(g_fit['Date'])

    # Reversing the index so the oldest rows come first
    g_fit = g_fit.reindex(index=g_fit.index[::-1])
    g_fit = g_fit.reset_index(drop=True)

    # === Filling empty values with 0 ===
    g_fit = g_fit.fillna(value=0, axis='columns')

    # === Fixing column dtypes ===
    # NOTE: dtype_map is currently unused; the explicit casts below do the work.
    dtype_map = {
        'Calories (kcal)': float,
        'Step count': int,
        'Inactive duration (ms)': int,
        'Cycling duration (ms)': int,
        'Walking duration (ms)': int,
        'Running duration (ms)': int
    }
    # g_fit['Calories (kcal)'] = g_fit['Calories (kcal)'].astype(float)
    g_fit['Step count'] = g_fit['Step count'].astype(np.int64)
    g_fit['Inactive duration (ms)'] = g_fit['Inactive duration (ms)'].astype(np.int64)
    g_fit['Cycling duration (ms)'] = g_fit['Cycling duration (ms)'].astype(np.int64)
    g_fit['Walking duration (ms)'] = g_fit['Walking duration (ms)'].astype(np.int64)
    g_fit['Running duration (ms)'] = g_fit['Running duration (ms)'].astype(np.int64)

    # Dropping all the dates already in the database
    tbl_name = daily_config['google_fit']['tbl_name']
    g_fit = cutil.get_only_new_data_df(g_fit, tbl_name)

    # ===== Google Fit saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates if get_only_new_data_df misses anything.
    conn = sqlite3.connect('selfdata_01.db')
    g_fit.to_sql('google_fit', conn, if_exists='append')
    conn.close()
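
# All the cleaners in this module lean on cutil.get_only_new_data_df to drop
# rows whose dates are already stored. The real implementation lives in the
# cutil module; the sketch below only illustrates how such a helper could
# work, assuming every table has a 'Date' column stored as ISO text (the
# underscore-prefixed name marks it as hypothetical, not the actual helper).
def _get_only_new_data_df_sketch(df, tbl_name, db='selfdata_01.db'):
    """Keep only rows whose Date is newer than the latest one in tbl_name."""
    conn = sqlite3.connect(db)
    try:
        last = pd.read_sql(f'SELECT MAX(Date) AS d FROM "{tbl_name}"',
                           conn)['d'][0]
    except pd.io.sql.DatabaseError:
        last = None  # table does not exist yet, so everything is new
    finally:
        conn.close()
    if last is None:
        return df
    return df[df['Date'] > pd.to_datetime(last)].reset_index(drop=True)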
def clean_pomodoro_excel():
    # ===== Config =====
    excel_pomodoro_path = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro'

    # ===== Excel collection of multiple Excel files =====
    excel_pomodoro_total_daily = pd.DataFrame(columns=['Date', 'pomo_total'])
    excel_pomo = pd.DataFrame(columns=['Date', 'subject', 'subject_none'])
    excel_noo = pd.DataFrame(columns=['Date', 'nootropic'])

    # = For dev purposes only =
    # excel_single_file_last = r'11-13.xlsx'
    # excel_single_file_first = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\04-11.xlsx'
    # excel_single_file_first = r'04-11.xlsx'
    excel_single_file_custom = r'05-24.xlsx'

    # ===== Reading and processing the Excel data for the SQL DB =====
    # for file in [excel_single_file_custom]:  # dev: read a single file
    for file in os.listdir(excel_pomodoro_path):
        try:
            print('file reading: {} \n'.format(file))
            xl = pd.ExcelFile(excel_pomodoro_path + "\\" + file)
            # 10-12.xlsx is the first file with a nootropic column.
            df_day = xl.parse(
                "Sheet1",
                names=['time', 'subject', 'subject_none', 'nootropic'],
                parse_cols="B,C,D,E")

            # Older sheets have no nootropic column.
            if df_day['nootropic'].iloc[0] != 'Nootropics':
                df_day = df_day.drop('nootropic', axis=1)

            # Drop empty rows picked up at the bottom of the sheet and the
            # Excel header label at the top; a row with no time has no data.
            df_day = df_day.dropna(axis=0, how='any', subset=['time'])
            df_day = df_day[df_day['time'] != 'Time']
            df_day = df_day.reset_index(drop=True)

            # 1. excel_pomodoro_total_daily
            # Simplified for now: total pomodoros per day rather than a
            # detailed per-day breakdown.
            df_day_temp = df_day.dropna(axis=0, how='any', subset=['subject'])
            # For now the file name stands in for the date.
            day_sum = {'Date': [file], 'pomo_total': [len(df_day_temp)]}
            df_day_sum = pd.DataFrame(day_sum, columns=['Date', 'pomo_total'])
            excel_pomodoro_total_daily = excel_pomodoro_total_daily.append(
                df_day_sum)

            # 2. excel_pomo, excel_noo
            # Build the Date column from the file name (month-day) and the
            # parent folder name (year). TODO: derive the year more robustly.
            cur_month_day = file.replace('.xlsx', '')
            cur_year = os.path.basename(os.path.dirname(excel_pomodoro_path))
            cur_date = f'{cur_year}-{cur_month_day} '
            df_day['time'] = df_day['time'].astype(str)
            df_day['Date'] = cur_date + df_day['time']
            # record_date keeps the sheet's calendar day, so SQL filters for
            # "today's pomodoros" still work after the past-midnight fix below.
            df_day['record_date'] = cur_date
            try:
                df_day['Date'] = pd.to_datetime(df_day['Date'],
                                                format="%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Times past midnight belong to the next calendar day. Note
                # that every post-midnight row is stamped with the next day's
                # midnight; the actual times are not preserved.
                next_day = False
                for index, row in df_day.iterrows():
                    if next_day is False:
                        if row['time'] == '1900-01-01 00:00:00':
                            next_day = True
                            cur_date_datetime = datetime.strptime(
                                cur_date, "%Y-%m-%d ")
                            tmr_date_datetime = cur_date_datetime + timedelta(
                                days=1)
                            tmr_date_str = tmr_date_datetime.strftime(
                                "%Y-%m-%d %H:%M:%S")
                            df_day.loc[index, 'Date'] = tmr_date_str
                            continue
                        # Assign via .loc; writing to `row` only changes a copy.
                        df_day.loc[index, 'Date'] = cur_date + row['time']
                    elif next_day is True:
                        cur_date_datetime = datetime.strptime(
                            cur_date, "%Y-%m-%d ")
                        tmr_date_datetime = cur_date_datetime + timedelta(
                            days=1)
                        tmr_date_str = tmr_date_datetime.strftime(
                            "%Y-%m-%d %H:%M:%S")
                        df_day.loc[index, 'Date'] = tmr_date_str
                df_day['Date'] = pd.to_datetime(df_day['Date'],
                                                format="%Y-%m-%d %H:%M:%S")

            # Prep for excel_pomo and append each day to the bigger frames.
            df_pomo = df_day[['Date', 'record_date', 'subject', 'subject_none']]
            excel_pomo = excel_pomo.append(df_pomo)

            if 'nootropic' in df_day.columns:
                df_noo = df_day[['Date', 'nootropic']]
                # Only keep times that have data.
                df_noo = df_noo.dropna(axis=0, how='any', subset=['nootropic'])
                excel_noo = excel_noo.append(df_noo)
        except IndexError:
            print("index error on file: {}".format(file))

    print("\nall files read")

    # For now assume every date is in 2017; later derive the year from the
    # folder name, probably in a different stage of the pipeline.
    excel_pomodoro_total_daily['pomo_total'] = excel_pomodoro_total_daily[
        'pomo_total'].astype(int)
    excel_pomodoro_total_daily['Date'] = excel_pomodoro_total_daily[
        'Date'].str.replace('.xlsx', '')
    excel_pomodoro_total_daily[
        'Date'] = "2017-" + excel_pomodoro_total_daily['Date']
    excel_pomodoro_total_daily['Date'] = pd.to_datetime(
        excel_pomodoro_total_daily['Date'], format="%Y-%m-%d")

    # TODO: IMPORTANT: when appending, the DataFrame index is no longer
    # unique. Consider SQLAlchemy. For now this is fine: the data is rebuilt
    # from scratch each run, so writing with if_exists='replace' below keeps
    # the table consistent.
    excel_pomo = excel_pomo.reset_index(drop=True)
    print(excel_pomo)

    # Dropping all the dates already in the database
    # TODO: DEL?
    # tbl_name_daily = daily_config['pomo_excel_daily']['tbl_name']
    # excel_pomodoro_total_daily = cutil.get_only_new_data_df(excel_pomodoro_total_daily, tbl_name_daily)
    tbl_name_pomo = daily_config['excel_pomodoro']['tbl_name']
    # Not needed with if_exists='replace':
    # excel_pomo = cutil.get_only_new_data_df(excel_pomo, tbl_name_pomo)
    tbl_name_noo = daily_config['excel_nootropic']['tbl_name']
    excel_noo = cutil.get_only_new_data_df(excel_noo, tbl_name_noo)

    # ===== Excel pomo saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates for the tables written with if_exists='append'.
    conn = sqlite3.connect('selfdata_01.db')
    # TODO: DEL?
    excel_pomodoro_total_daily.to_sql('pomo_excel_daily', conn,
                                      if_exists='append')
    excel_pomo.to_sql(tbl_name_pomo, conn, if_exists='replace')
    excel_noo.to_sql(tbl_name_noo, conn, if_exists='append')
    conn.close()
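
# The duplicate-row TODO above could also be solved at the database level
# instead of in pandas. A minimal sketch, assuming the target table declares
# a UNIQUE constraint on its date column (the schema assumption and helper
# name are illustrative, not part of the existing code):
def _append_without_duplicates_sketch(df, tbl_name, db='selfdata_01.db'):
    """Insert rows, silently skipping ones that violate a UNIQUE constraint."""
    conn = sqlite3.connect(db)
    cols = list(df.columns)
    col_list = ", ".join('"{}"'.format(c) for c in cols)
    placeholders = ", ".join("?" for _ in cols)
    sql = 'INSERT OR IGNORE INTO "{}" ({}) VALUES ({})'.format(
        tbl_name, col_list, placeholders)
    # Values are stringified for simplicity; real code would keep dtypes.
    rows = (tuple(str(v) for v in row) for row in df.itertuples(index=False))
    conn.executemany(sql, rows)
    conn.commit()
    conn.close()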
def clean_weather():
    # ===== Config =====
    weather_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\daily-text'

    # ===== Weather collection of multiple CSVs =====
    weather = pd.DataFrame()
    names = [
        'Time', 'Temp', 'Humid', 'DewPt', 'Press', 'WindSp', 'WindDr', 'Sun',
        'Rain', 'Start', 'MxWSpd'
    ]
    drop_row_range = list(range(8))

    # For dev purposes: run the whole thing on a smaller set of files.
    dev_smaller_files = os.listdir(weather_path)[-50:-1]

    # ===== Reading CSVs + weather collection cleaning =====
    for file in os.listdir(weather_path):
        # for file in dev_smaller_files:
        try:
            print('file reading: {} \n'.format(file))
            weather_monthly = pd.read_csv(weather_path + "\\" + file,
                                          sep='\t',
                                          names=names)
            weather_monthly = weather_monthly.drop(
                weather_monthly.index[drop_row_range])

            # The date is in the file name itself, so it has to be added here.
            weather_monthly['DateOnly'] = file
            weather_monthly['DateOnly'] = pd.to_datetime(
                weather_monthly['DateOnly'], format="%Y_%m_%d")
            # TODO: bug somewhere in the data conversion; something to do with
            # a certain file or the way it is processed.
            weather_monthly['Date'] = file + " " + weather_monthly['Time']
            weather_monthly['Date'] = pd.to_datetime(weather_monthly['Date'],
                                                     format="%Y_%m_%d %H:%M")
            weather_monthly['Temp'] = weather_monthly['Temp'].astype(
                np.float64)
            weather = weather.append(weather_monthly)
            print('\n\n')
        except IndexError:
            print("index error on file: {}".format(file))
            print("Might have no data for this date")
    print("all files read")

    # === Fixing column dtypes ===
    weather = weather.drop('Time', axis=1)
    weather['Humid'] = weather['Humid'].astype(np.int64)
    weather['DewPt'] = weather['DewPt'].astype(np.float64)
    weather['Press'] = weather['Press'].astype(np.int64)
    weather['WindSp'] = weather['WindSp'].astype(np.float64)
    weather['Sun'] = weather['Sun'].astype(np.float64)
    weather['Rain'] = weather['Rain'].astype(np.float64)
    weather['MxWSpd'] = weather['MxWSpd'].astype(np.int64)

    # == Column ordering, date columns first ==
    columns_order = [
        'DateOnly', 'Date', 'Temp', 'Humid', 'DewPt', 'Press', 'WindSp',
        'WindDr', 'Sun', 'Rain', 'Start', 'MxWSpd'
    ]
    weather = weather[columns_order]

    # Sorting oldest-first for future use
    weather = weather.sort_values('Date', axis=0)
    weather = weather.reset_index(drop=True)

    # =========================
    # ====== Creating the daily database
    # =========================
    weather_hourly = weather.copy()
    weather_hourly = weather_hourly.groupby('DateOnly')
    weather_daily = weather_hourly.agg({
        'Temp': ['mean', 'min', 'max'],
        'Humid': 'mean',
        'Sun': 'max',
        'Rain': 'max'
    })

    # Flatten the two-level aggregate columns back into one level,
    # e.g. ('Temp', 'max') -> 'Temp_max'.
    weather_daily.columns = [
        "_".join(x) for x in weather_daily.columns.ravel()
    ]
    weather_daily = weather_daily.rename(columns={
        'Sun_max': 'Sun_total',
        'Rain_max': 'Rain_total'
    })
    weather_daily = weather_daily.round(1)
    weather_daily = weather_daily.reset_index()

    # Renaming DateOnly to Date
    weather_daily = weather_daily.rename(columns={'DateOnly': 'Date'})
    col_reorder = [
        'Date', 'Temp_mean', 'Temp_min', 'Temp_max', 'Humid_mean',
        'Sun_total', 'Rain_total'
    ]
    weather_daily = weather_daily[col_reorder]

    # Dropping all the dates already in the database
    tbl_name = daily_config['weather_daily']['tbl_name']
    weather_daily = cutil.get_only_new_data_df(weather_daily, tbl_name)

    # ===== Weather saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    # weather.to_sql('weather_hourly', conn)  # , if_exists='append'
    weather_daily.to_sql('weather_daily', conn, if_exists='append')
    conn.close()
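
# Quick illustration of the column flattening used in clean_weather above:
# aggregating with multiple functions produces a two-level column index,
# which "_".join collapses back into single names (toy data, not the real
# weather set):
def _flatten_agg_columns_demo():
    toy = pd.DataFrame({'DateOnly': ['2017-07-01', '2017-07-01'],
                        'Temp': [18.0, 24.0]})
    daily = toy.groupby('DateOnly').agg({'Temp': ['mean', 'min', 'max']})
    # ('Temp', 'mean') becomes 'Temp_mean', and so on.
    daily.columns = ["_".join(x) for x in daily.columns.ravel()]
    print(daily)  # columns: Temp_mean, Temp_min, Temp_max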
def clean_pomodoro_kanban():
    # ===== Config =====
    kanban_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\Today board.csv'

    # ===== Kanban collection cleaning =====
    # cols = ['Name', 'Color', 'Time spent', 'Labels', 'Comments', 'Grouping date', 'Created timestamp']
    cols = ['Time spent', 'Grouping date']
    dtype = {'Time spent': np.float64}
    kanban = pd.read_csv(kanban_path, usecols=cols, dtype=dtype)

    # Fixing date
    kanban['Date'] = pd.to_datetime(kanban['Grouping date'], format='%Y-%m-%d')
    kanban.drop(['Grouping date'], inplace=True, axis=1)

    # Finding total time spent per day
    kanban = kanban.groupby('Date')
    kanban = kanban.sum()
    kanban = kanban.reset_index()

    def dec_norm(time):
        """Convert decimal hours (e.g. 1.5) to total minutes (e.g. 90.0)."""
        hours = int(time)
        minutes = (time * 60) % 60
        return hours * 60 + minutes

    # Convert decimal time to minutes, then estimate pomodoros at 26 minutes
    # each, rounded to the nearest whole pomodoro.
    kanban['Total Min'] = kanban['Time spent'].apply(dec_norm)
    kanban['pomo_total'] = kanban['Total Min'] / 26
    kanban['pomo_total'] = kanban['pomo_total'].round(0)
    kanban = kanban.drop(['Time spent', 'Total Min'], axis=1)

    # Dropping all the dates already in the database
    tbl_name = daily_config['pomo_kanban_daily']['tbl_name']
    kanban = cutil.get_only_new_data_df(kanban, tbl_name)

    # ===== Kanban saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    kanban.to_sql('pomo_kanban_daily', conn, if_exists='append')
    conn.close()
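
# Standalone check of the decimal-hours conversion used in
# clean_pomodoro_kanban. The inner dec_norm is not reachable from outside
# that function, so the same arithmetic is repeated here for illustration:
def _dec_hours_to_minutes(time):
    hours = int(time)
    minutes = (time * 60) % 60
    return hours * 60 + minutes

# e.g. _dec_hours_to_minutes(2.6) == 156.0, and 156 / 26 == 6.0 pomodoros.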
def clean_weight():
    # ===== Config =====
    fitbit_path = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\201x TO 2017-07-24\weight'

    # ===== Weight collection of multiple CSVs =====
    weight = pd.DataFrame()
    names = ['Date', 'Weight', 'BMI', 'Fat']  # Not parsing dates yet

    # ===== Reading CSVs + weight collection cleaning =====
    for file in os.listdir(fitbit_path):
        weight_monthly = pd.read_csv(fitbit_path + "\\" + file,
                                     names=names,
                                     dtype={'Date': object})
        weight_monthly = weight_monthly.drop(weight_monthly.index[[0, 1]])
        try:
            weight_monthly['Date'] = pd.to_datetime(weight_monthly['Date'],
                                                    format="%d-%m-%Y")
        except ValueError:
            print(file)
            print('Problem parsing date in %d-%m-%Y format. '
                  'Trying the second format.')
            try:
                weight_monthly['Date'] = pd.to_datetime(
                    weight_monthly['Date'], format="%d/%m/%Y")
                print('Parsed date in %d/%m/%Y format instead.')
            except ValueError:
                print('DATE ISSUE: could not parse date in either format.')
        weight = weight.append(weight_monthly)

    # === Fixing column dtypes ===
    weight['Weight'] = weight['Weight'].astype(np.float64)
    weight['BMI'] = weight['BMI'].astype(np.float64)
    weight['Fat'] = weight['Fat'].astype(np.float64)

    # Sorting oldest-first for future use
    weight = weight.sort_values('Date', axis=0)
    weight = weight.reset_index(drop=True)

    # TODO: fill in missing dates with estimates; see the reindex sketch
    # after this function.

    # Dropping all the dates already in the database
    tbl_name = daily_config['weight']['tbl_name']
    weight = cutil.get_only_new_data_df(weight, tbl_name)

    # ===== Weight saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    weight.to_sql('weight', conn, if_exists='append')
    conn.close()
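
# Sketch for the "fill in missing dates" TODO in clean_weight: reindex over
# the full date range so days without a weigh-in appear as NaN, which
# interpolation could then estimate. Assumes one row per date, which holds
# after the sort above; this helper is an illustration and is not wired in.
def _fill_missing_dates_sketch(weight):
    all_dates = pd.date_range(weight['Date'].iloc[0], weight['Date'].iloc[-1])
    weight = weight.set_index('Date').reindex(all_dates)
    weight.index.name = 'Date'
    # weight = weight.interpolate()  # optional: estimate the gaps
    return weight.reset_index()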
def clean_daylio():
    # ===== Config =====
    daylio_file = r'\daylio_export.csv'
    path_daylio = root_dir + daylio_file
    # path_daylio = r'D:\OneDrive\0 My Files\1 Documents\4 Raw Data\2 My Data\1_current\daylio_export.csv'

    # ===== Daylio collection cleaning =====
    daylio = pd.read_csv(path_daylio)

    # Fixing date
    daylio['year'] = daylio['year'].apply(str)
    daylio['date-str'] = daylio['date'] + ' ' + daylio['year'] + ' ' + daylio['time']
    daylio['date-datetime'] = pd.to_datetime(daylio['date-str'])

    # Dropping columns no longer required
    cols_drop_after_date = ['year', 'date', 'weekday', 'time', 'date-str']
    daylio = daylio.drop(cols_drop_after_date, axis=1)

    # Normalizing mood onto a 0-10 scale
    mood_map = {'awful': 0, 'fugly': 2.5, 'meh': 5, 'good': 7.5, 'rad': 10}
    daylio['mood'] = daylio['mood'].map(mood_map)

    # Reduce each timestamp to its calendar day
    daylio['Date'] = daylio['date-datetime'].dt.strftime('%Y-%m-%d')
    daylio = daylio.drop('date-datetime', axis=1)
    daylio = daylio[['Date', 'mood']]

    # Averaging all entries within a day.
    # WARNING: groupby().mean() silently drops non-numeric columns, but those
    # carry little data, so that is acceptable for now.
    daylio = daylio.groupby('Date')
    daylio = daylio.mean()
    daylio = daylio.reset_index()

    # Back to a datetime dtype
    daylio['Date'] = pd.to_datetime(daylio['Date'], format="%Y-%m-%d")

    # Dropping all the dates already in the database
    tbl_name = daily_config['mood']['tbl_name']
    daylio = cutil.get_only_new_data_df(daylio, tbl_name)

    # ===== Daylio saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    daylio.to_sql('mood', conn, if_exists='append')
    conn.close()
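
# Small illustration of the mood normalization plus daily averaging done in
# clean_daylio (toy rows, not real export data):
def _mood_average_demo():
    mood_map = {'awful': 0, 'fugly': 2.5, 'meh': 5, 'good': 7.5, 'rad': 10}
    toy = pd.DataFrame({'Date': ['2017-07-01', '2017-07-01', '2017-07-02'],
                        'mood': ['meh', 'rad', 'good']})
    toy['mood'] = toy['mood'].map(mood_map)
    # 2017-07-01 averages to 7.5 (mean of 5 and 10); 2017-07-02 stays 7.5.
    print(toy.groupby('Date').mean().reset_index())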
# NOTE: older, simplified cleaner that only computes daily totals. It
# originally shared the name clean_pomodoro_excel, which made it shadow the
# fuller version above; renamed here and kept for reference.
def clean_pomodoro_excel_daily():
    # ===== Config =====
    excel_pomodoro_path = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro'

    # ===== Excel collection of multiple Excel files =====
    excel_pomodoro_total_daily = pd.DataFrame(columns=['Date', 'pomo_total'])

    # = For dev purposes only =
    # excel_single_file_last = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\08-23.xlsx'
    # excel_single_file_first = r'D:\OneDrive\0 My Files\1 Documents\2 MS Office\1 Excel\2017\2 Pomodoro\04-11.xlsx'

    # ===== Reading the Excel files + cleaning =====
    # for file in [excel_single_file_last]:  # dev: read a single file
    for file in os.listdir(excel_pomodoro_path):
        try:
            print('file reading: {} \n'.format(file))
            xl = pd.ExcelFile(excel_pomodoro_path + "\\" + file)
            df_day = xl.parse("Sheet1",
                              names=['time', 'subject', 'subject_non'],
                              parse_cols="B,C,D")
            # A row with no time has no data; also drop the Excel header label.
            df_day = df_day.dropna(axis=0, how='any', subset=['time'])
            df_day = df_day[df_day['time'] != 'Time']
            df_day = df_day.reset_index(drop=True)

            # Simplified for now: total pomodoros per day rather than a
            # detailed per-day breakdown.
            df_day_temp = df_day.dropna(axis=0, how='any', subset=['subject'])
            # For now the file name stands in for the date.
            day_sum = {'Date': [file], 'pomo_total': [len(df_day_temp)]}
            df_day_sum = pd.DataFrame(day_sum, columns=['Date', 'pomo_total'])
            excel_pomodoro_total_daily = excel_pomodoro_total_daily.append(
                df_day_sum)
        except IndexError:
            print("index error on file: {}".format(file))
    print("all files read")

    # For now assume every date is in 2017; later derive the year from the
    # folder name, probably in a different stage of the pipeline.
    excel_pomodoro_total_daily['pomo_total'] = excel_pomodoro_total_daily[
        'pomo_total'].astype(int)
    excel_pomodoro_total_daily['Date'] = excel_pomodoro_total_daily[
        'Date'].str.replace('.xlsx', '')
    excel_pomodoro_total_daily[
        'Date'] = "2017-" + excel_pomodoro_total_daily['Date']
    excel_pomodoro_total_daily['Date'] = pd.to_datetime(
        excel_pomodoro_total_daily['Date'], format="%Y-%m-%d")

    # Dropping all the dates already in the database
    tbl_name = daily_config['pomo_excel_daily']['tbl_name']
    excel_pomodoro_total_daily = cutil.get_only_new_data_df(
        excel_pomodoro_total_daily, tbl_name)

    # ===== Excel pomo saving to sqlite =====
    # TODO: decide how to handle rows already in the database; as-is, re-runs
    # create duplicates.
    conn = sqlite3.connect('selfdata_01.db')
    excel_pomodoro_total_daily.to_sql('pomo_excel_daily', conn,
                                      if_exists='append')
    conn.close()
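
# A minimal driver sketch for running every cleaner in sequence. None of the
# functions take arguments and each manages its own sqlite connection, so
# order is not critical (this entry point is an assumption, not part of the
# original module):
if __name__ == '__main__':
    clean_google_fit_summary()
    clean_pomodoro_excel()
    clean_weather()
    clean_pomodoro_kanban()
    clean_weight()
    clean_daylio()
    clean_pomodoro_excel_daily()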