Example #1
async def check_load_new_data(logger):
    """
    Checks for new files, checks the size of existing files for updates, and processes/commits new data to the database.

    :param logger: logging logger at module level
    :return: boolean, did it run/process new data?
    """

    logger.info('Running check_load_new_data()')

    try:
        from summit_core import picarro_logs_path as data_path
        from summit_core import picarro_dir as rundir
        from summit_core import connect_to_db, get_all_data_files, check_filesize
        from summit_picarro import Base, DataFile, Datum
        from sqlalchemy.orm.exc import MultipleResultsFound
        from summit_errors import EmailTemplate, sender, processor_email_list

        import pandas as pd
        from pandas.errors import EmptyDataError, ParserError
    except ImportError as e:
        logger.error('ImportError occurred in check_load_new_data()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_picarro.sqlite',
                                        rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(
            f'Exception {e.args} caused database connection to fail in check_load_new_data()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        db_files = session.query(DataFile)
        db_filenames = [d.name for d in db_files.all()]

        all_available_files = get_all_data_files(data_path, '.dat')

        files_to_process = session.query(DataFile).filter(
            DataFile.processed == False).all()

        for file in all_available_files:
            try:
                db_match = db_files.filter(
                    DataFile._name == file.name).one_or_none()
            except MultipleResultsFound:
                logger.warning(
                    f'Multiple results found for file {file.name}. The first was used.'
                )
                db_match = db_files.filter(DataFile._name == file.name).first()

            if file.name not in db_filenames:
                files_to_process.append(DataFile(file))
            elif check_filesize(file) > db_match.size:
                # if a matching file was found and it's now bigger, append for processing
                logger.info(
                    f'File {file.name} had more data and was added for processing.'
                )
                files_to_process.append(db_match)

        if not files_to_process:
            logger.warning('No new data was found.')
            return False

        for ind, file in enumerate(files_to_process):
            files_to_process[ind] = session.merge(
                file
            )  # merge files and return the merged object to overwrite the old
            logger.info(f'File {file.name} added for processing.')
        session.commit()

        for file in files_to_process:
            try:
                df = pd.read_csv(file.path, delim_whitespace=True)
            except EmptyDataError as e:
                logger.error(
                    f'Exception {e.args} occurred while reading {file.name}')
                send_processor_email(PROC, exception=e)
                continue
            except ParserError as e:
                logger.error(
                    f'Pandas ParserError occurred while reading {file.name}.')
                from summit_errors import send_processor_warning
                try:
                    df = pd.read_csv(file.path,
                                     delim_whitespace=True,
                                     error_bad_lines=False)
                    send_processor_warning(PROC, 'Dataframe', (
                        f'The Picarro Processor failed to read file {file.name}. '
                        + 'It was re-parsed, skipping unreadable lines, but should be '
                        + 'investigated.'))

                except Exception as e:
                    logger.error(
                        f'Exception {e.args} occurred in check_load_new_data() while reading a file.'
                        + f' The file was {file.name}')
                    send_processor_email(PROC, exception=e)
                    continue
            except Exception as e:
                logger.error(
                    f'Exception {e.args} occurred in check_load_new_data() while reading a file.'
                    + f' The file was {file.name}')
                send_processor_email(PROC, exception=e)
                continue

            original_length = len(df)

            df.dropna(axis=0, how='any', inplace=True)

            new_length = len(df)
            diff = original_length - new_length

            if diff:
                logger.warning(
                    f'Dataframe contained {diff} null values in {file.name}.')
                from summit_errors import send_processor_warning

                send_processor_warning(PROC, 'DataFrame', (
                    f'The Picarro Processor cut {diff} lines from a dataframe after reading it.\n'
                    +
                    f'{file.name} should be investigated and cleaned if necessary.'
                ))

            # CO2 stays in ppm
            df['CO_sync'] *= 1000  # convert CO to ppb
            df['CH4_sync'] *= 1000  # convert CH4 to ppb
            df['CH4_dry_sync'] *= 1000

            df_list = df.to_dict('records')  # convert to list of dicts

            data_list = []
            for line in df_list:
                data_list.append(Datum(line))

            if data_list:
                data_dates = [d.date for d in data_list]
                dates_already_in_db = session.query(Datum.date).filter(
                    Datum.date.in_(data_dates)).all()
                dates_already_in_db[:] = [d.date for d in dates_already_in_db]

                for d in data_list:
                    if d.date not in dates_already_in_db:
                        d.file_id = file.id  # relate Datum to the file it originated in
                        session.add(d)
            else:
                logger.info(f'No new data created from file {file.name}.')

            file.processed = True
            file.size = check_filesize(file.path)
            logger.info(f'All data in file {file.name} processed.')
            session.commit()

        return True

    except Exception as e:
        logger.error(f'Exception {e.args} occurred in check_load_new_data().')
        send_processor_email(PROC, exception=e)
        return False
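
A minimal sketch of how this coroutine might be driven on its own, assuming a configured logger and that PROC and send_processor_email are defined at module level as in the rest of the project (the __main__ block below is an assumption, not project code):

import asyncio
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('summit_picarro')

    # run the coroutine once and report whether new data was processed
    processed = asyncio.run(check_load_new_data(logger))
    print(f'New data processed: {processed}')
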
Example #2
async def move_log_files(logger):
    """
    Runs continuously and sleeps for 10 minutes at a time. Combs the directories for new data files and moves any that
    are new or have been updated. This WILL NOT handle the turnover to a new year in the daily files well, as they have
    no year in the filename. I can't fix that.

    :param logger: logging logger to log to
    :return: boolean, True if ran without errors
    """

    while True:
        try:
            from summit_errors import send_processor_email, EmailTemplate, sender, processor_email_list
            from shutil import copy
            import datetime as dt
            import os
        except ImportError:
            logger.error('ImportError occurred in move_log_files()')
            return False

        try:
            engine, session = connect_to_db('sqlite:///summit_core.sqlite',
                                            core_dir)
            MovedFile.__table__.create(engine, checkfirst=True)
        except Exception as e:
            logger.error(
                f'Exception {e.args} prevented connection to the database in move_log_files()'
            )
            send_processor_email('Core', exception=e)
            return False

        try:
            logger.info('Running move_log_files()')

            sync_paths = [
                methane_logs_sync, voc_logs_sync, daily_logs_sync,
                picarro_logs_sync
            ]
            data_paths = [
                methane_logs_path, voc_logs_path, daily_logs_path,
                picarro_logs_path
            ]
            data_types = ['methane', 'voc', 'daily', 'picarro']
            file_types = ['.txt', '.txt', '.txt', '.dat']

            for sync_path, type_, data_path, file_type in zip(
                    sync_paths, data_types, data_paths, file_types):

                # rename the daily files to include the current year before reading them in (implemented: 2/14/2020)
                current_year = str(dt.datetime.now().year)
                for d in get_all_data_files(daily_logs_sync, '.txt'):
                    if check_path_date(d).year == dt.datetime.now().year and current_year not in str(d):
                        name, extension = os.path.splitext(d)
                        d.rename(name + '_' + current_year + extension)

                sync_files = [
                    MovedFile(path, type_, 'sync', check_filesize(path))
                    for path in get_all_data_files(sync_path, file_type)
                ]

                data_files = (session.query(MovedFile).filter(
                    MovedFile.location == 'data').filter(
                        MovedFile.type == type_).all())
                moved_data_files = [d.name for d in data_files]

                for file in sync_files:
                    if file.name not in moved_data_files:
                        try:
                            copy(file.path, data_path)  # will overwrite
                        except PermissionError:
                            logger.error(
                                f'File {file.name} could not be moved due to a permissions error.'
                            )
                            from summit_errors import send_processor_warning
                            send_processor_warning(
                                PROC, 'PermissionError',
                                f'File {file.name} could not be moved due to a permissions error.\n'
                                + 'Copying/pasting the file, deleting the old one, and renaming '
                                + 'the file to its old name should allow it to be processed.\n'
                                + 'This will require admin privileges.')
                            continue
                        file.path = data_path / file.name
                        file.location = 'data'
                        session.merge(file)
                        logger.info(
                            f'File {file.name} moved to data directory.')
                    else:
                        matched_file = search_for_attr_value(
                            data_files, 'name', file.name)
                        if file.size > matched_file.size:
                            try:
                                copy(file.path, data_path)  # will overwrite
                            except PermissionError:
                                logger.error(
                                    f'File {file.name} could not be moved due to a permissions error.'
                                )
                                from summit_errors import send_processor_warning

                                send_processor_warning(
                                    PROC, 'PermissionError',
                                    f'File {file.name} could not be moved due to a permissions error.\n'
                                    + 'Copying/pasting the file, deleting the old one, and renaming '
                                    + 'the file to its old name should allow it to be processed.\n'
                                    + 'This will require admin privileges.')
                                continue

                            matched_file.size = check_filesize(
                                matched_file.path)
                            session.merge(matched_file)
                            logger.info(
                                f'File {matched_file.name} updated in data directory.'
                            )

            session.commit()

            session.close()
            engine.dispose()

            import gc
            gc.collect()

            for i in range(20):
                await asyncio.sleep(30)

        except Exception as e:
            logger.error(f'Exception {e.args} occurred in move_log_files().')
            send_processor_email('Core', exception=e)
            session.close()
            engine.dispose()
            return False
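
Because move_log_files() loops indefinitely (it only returns on error), it would be scheduled alongside the other processor coroutines rather than awaited once; a hedged sketch of one way to do that (main() here is an assumption, not project code):

import asyncio
import logging

async def main():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('summit_core')

    # the mover runs as a long-lived task; other processor coroutines could be gathered alongside it
    await asyncio.gather(move_log_files(logger))

if __name__ == '__main__':
    asyncio.run(main())
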
Example #3
async def dual_plot_methane(logger):
    """
    Connects to both the methane [GC] and Picarro databases to create an overlaid plot of both datasets.

    :param logger: logger, to log events to
    :return: Boolean, True if it ran without error and created data, False if not
    """

    PROC = 'Methane DualPlotter'

    try:
        from pathlib import Path
        from summit_core import core_dir, Config
        from summit_core import methane_dir
        from summit_core import picarro_dir
        from summit_core import connect_to_db, create_daily_ticks, TempDir, Plot, add_or_ignore_plot
        from summit_picarro import Datum
        from summit_methane import Base, GcRun, summit_methane_plot

        from summit_picarro import Base as PicarroBase

        remotedir = r'/data/web/htdocs/instaar/groups/arl/summit/plots'

    except ImportError as e:
        logger.error('ImportError occurred in dual_plot_methane()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        gc_engine, gc_session = connect_to_db(
            'sqlite:///summit_methane.sqlite', methane_dir)
        Base.metadata.create_all(gc_engine)

        picarro_engine, picarro_session = connect_to_db(
            'sqlite:///summit_picarro.sqlite', picarro_dir)
        PicarroBase.metadata.create_all(picarro_engine)
    except Exception as e:
        logger.error(
            f'Exception {e.args} prevented connection to the database in dual_plot_methane()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db(
            'sqlite:///summit_core.sqlite', core_dir)
        Plot.__table__.create(core_engine, checkfirst=True)
        Config.__table__.create(core_engine, checkfirst=True)

        twoplot_config = core_session.query(Config).filter(
            Config.processor == PROC).one_or_none()

        if not twoplot_config:
            twoplot_config = Config(
                processor=PROC
            )  # use all default values except processor on init
            core_session.add(twoplot_config)
            core_session.commit()

    except Exception as e:
        logger.error(
            f'Error {e.args} prevented connecting to the core database in dual_plot_methane()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        logger.info('Running dual_plot_methane()')

        newest_picarro_data_point = (picarro_session.query(Datum.date).filter(
            Datum.mpv_position == 1).order_by(Datum.date.desc()).first()[0])
        try:
            newest_gc_data_point = (gc_session.query(GcRun.date).filter(
                GcRun.median != None).filter(GcRun.standard_rsd < .02).filter(
                    GcRun.rsd < .02).order_by(GcRun.date.desc()).first()[0])
        except TypeError:
            logger.error(
                'NoneType not subscriptable encountered due to lack of methane data to query.'
            )
            from summit_errors import send_processor_warning
            send_processor_warning(
                PROC, 'Dual Plotter',
                'The Methane Dual Plotter could not query any GcRuns for methane data.\n'
                + 'Check the database to make sure there are in fact GcRuns with medians and valid rsds.\n'
                + 'This often happens when the methane database is remade without re-setting '
                + 'the filesize and pa_startline in the config table of the core database, '
                + 'so no peaks are found.')
            return False

        newest_data_point = max(newest_picarro_data_point,
                                newest_gc_data_point)

        if newest_data_point <= twoplot_config.last_data_date:
            logger.info('No new data was found to plot.')
            core_session.close()
            core_engine.dispose()
            picarro_session.close()
            picarro_engine.dispose()
            return False

        date_limits, major_ticks, minor_ticks = create_daily_ticks(
            twoplot_config.days_to_plot)

        if newest_data_point > twoplot_config.last_data_date:

            runs_with_medians = (gc_session.query(GcRun).filter(
                GcRun.median != None).filter(GcRun.standard_rsd < .02).filter(
                    GcRun.rsd < .02).order_by(GcRun.date).all())

            gc_dates = [run.date for run in runs_with_medians]
            gc_ch4 = [run.median for run in runs_with_medians]

            picarro_data = (picarro_session.query(
                Datum.date, Datum.ch4).filter((Datum.mpv_position == 0) | (
                    Datum.mpv_position == 1)).filter(
                        (Datum.instrument_status == 963),
                        (Datum.alarm_status == 0)).filter(
                            Datum.date >= date_limits['left']).all()
                            )  # grab only data that falls in plotting period

            picarro_dates = [p.date for p in picarro_data]
            picarro_ch4 = [p.ch4 for p in picarro_data]

            with TempDir(methane_dir / 'plots'):
                name = summit_methane_plot(
                    None, {
                        'Summit Methane [Picarro]':
                        [picarro_dates, picarro_ch4],
                        'Summit Methane [GC]': [gc_dates, gc_ch4]
                    },
                    title='Summit Methane [Picarro & GC]',
                    limits={
                        'bottom': 1850,
                        'top': 2050,
                        'right': date_limits.get('right', None),
                        'left': date_limits.get('left', None)
                    },
                    major_ticks=major_ticks,
                    minor_ticks=minor_ticks)

                methane_plot = Plot(methane_dir / 'plots' / name, remotedir,
                                    True)  # stage plots to be uploaded
                add_or_ignore_plot(methane_plot, core_session)

                twoplot_config.last_data_date = newest_data_point
                core_session.merge(twoplot_config)

            logger.info('New data plots created.')
        else:
            logger.info('No new data found to be plotted.')

        gc_session.close()
        gc_engine.dispose()

        picarro_session.close()
        picarro_engine.dispose()

        core_session.commit()

        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        logger.error(f'Exception {e.args} occurred in dual_plot_methane()')
        send_processor_email(PROC, exception=e)

        core_session.close()
        core_engine.dispose()

        gc_session.close()
        gc_engine.dispose()

        picarro_session.close()
        picarro_engine.dispose()
        return False
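
The success and error paths above repeat the same session.close()/engine.dispose() cleanup for three databases. A minimal sketch of a context-manager wrapper around connect_to_db that would centralize that cleanup (managed_db is a hypothetical helper, not part of summit_core):

from contextlib import contextmanager

@contextmanager
def managed_db(connection_string, directory):
    """Yield a session from connect_to_db and guarantee the engine is disposed afterward."""
    engine, session = connect_to_db(connection_string, directory)
    try:
        yield session
    finally:
        session.close()
        engine.dispose()

# usage sketch: with managed_db('sqlite:///summit_methane.sqlite', methane_dir) as gc_session: ...
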
Example #4
async def update_excel_sheet(logger):
    """
    This checks for new GcRuns since it was last run and creates a DataFrame containing run information that's appended
    to a spreadsheet on the Z-drive. This sheet is filled out by whoever does the manual integration, and is later read
    by TODO - I haven't written that yet
    to bring the updated peak areas back into the database and re-calculate mixing ratios.

    :param logger: logging logger for info and failures
    :return: bool, True if ran, False if errored
    """
    logger.info('Running update_excel_sheet()')

    try:
        import pandas as pd
        from datetime import datetime

        from summit_core import methane_dir as rundir
        from summit_errors import send_processor_warning

        from summit_methane import GcRun, Base, add_formulas_and_format_sheet
        from summit_core import Config, connect_to_db, append_df_to_excel
        from summit_core import methane_dir, core_dir, data_file_paths

        methane_sheet = data_file_paths.get('methane_sheet', None)

        if not methane_sheet:
            logger.error(
                'Filepath for the methane integration sheet could not be retrieved.'
            )
            send_processor_warning(
                PROC, 'Filepath Error',
                '''The methane integration sheet filepath could not be retrieved. It should be listed
                                   as "methane_sheet" in file_locations.json in the core folder.'''
            )
            return False

    except ImportError as e:
        logger.error('ImportError occurred in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_methane.sqlite',
                                        rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(
            f'Exception {e.args} prevented connection to the database in update_excel_sheet()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db(
            'sqlite:///summit_core.sqlite', core_dir)
        Config.__table__.create(core_engine, checkfirst=True)

        methane_sheet_config = core_session.query(Config).filter(
            Config.processor == 'methane_sheet').one_or_none()

        if not methane_sheet_config:
            methane_sheet_config = Config(processor='methane_sheet')
            # use all default values except processor on init
            core_session.add(methane_sheet_config)
            core_session.commit()

    except Exception as e:
        logger.error(
            f'Error {e.args} prevented connecting to the core database in update_excel_sheet()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        most_recent_gcrun = session.query(GcRun.date).order_by(
            GcRun.date.desc()).first()

        if not most_recent_gcrun:
            most_recent_gcrun = datetime(
                1900, 1, 1)  # default to a safely historic date
        else:
            most_recent_gcrun = most_recent_gcrun.date  # get date from tuple response

        # object list of all the runs past the most recent date
        new_runs = session.query(GcRun).filter(
            GcRun.date > methane_sheet_config.last_data_date).all()

        if new_runs:
            col_list = [
                'date', 'filename', 'peak1', 'peak2', 'mr1', 'mr2',
                'run_median', 'run_rsd', 'std_median', 'std_rsd'
            ]  # list of all columns needed in the dataframe

            master_df = pd.DataFrame(
                index=None,
                columns=col_list)  # frame an empty df for new run data

            for run in new_runs:
                df = pd.DataFrame(
                    index=range(1, 6),
                    columns=col_list)  # create a five-row block to add later
                df.loc[1, 'date'] = run.date
                df.loc[1, 'filename'] = run.logfile.name  # add date and filename for this block

                # The below can copy peak information from the automatic integrations into the spreadsheet
                # peaks1 = [sample.peak for sample in run.samples if sample.sample_num in [0,2,4,6,8]]
                # peaks2 = [sample.peak for sample in run.samples if sample.sample_num in [1,3,5,7,9]]
                # df.loc[0:5, 'peak1'] = [(peak.pa if peak else None) for peak in peaks1]
                # df.loc[0:5, 'peak2'] = [(peak.pa if peak else None) for peak in peaks2]

                master_df = master_df.append(
                    df)  # append block to all new ones so far

            # TODO: Anything touching sheets needs to be carefully made to catch inaccessible files ##################
            append_df_to_excel(methane_sheet, master_df,
                               **{'index':
                                  False})  # add all new lines and save sheet
            add_formulas_and_format_sheet(
                methane_sheet
            )  # open sheet and add formulas where non-existent, format cols

            logger.info(
                'New GcRuns added to the automated integration spreadsheet.')

            methane_sheet_config.last_data_date = most_recent_gcrun
        else:
            logger.info(
                'No new GcRuns found to add to the automated integration spreadsheet.'
            )

        core_session.merge(methane_sheet_config)
        core_session.commit()

        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        logger.error(f'Exception {e.args} occurred in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False
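
One way to address the TODO above about inaccessible sheets is a small pre-flight check before appending; a hedged sketch (sheet_is_writable is a hypothetical helper, not project code):

def sheet_is_writable(sheet_path):
    """Return True if the spreadsheet can be opened for writing, i.e. it is not
    locked by an open Excel session on the shared drive."""
    try:
        with open(sheet_path, 'r+b'):
            return True
    except OSError:
        return False

# usage sketch: skip the append and send_processor_warning(...) when sheet_is_writable(methane_sheet) is False
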
Example #5
async def read_excel_sheet(logger):
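    """
    Stub for reading the manually integrated methane sheet back into the database. It currently only ensures the
    database connections and the 'methane_sheet_read' Config entry exist; the sheet-reading logic is not yet written.

    :param logger: logging logger for info and failures
    :return: bool, True if ran, False if errored
    """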
    logger.info('Running read_excel_sheet()')

    try:
        import pandas as pd
        from datetime import datetime

        from summit_core import methane_dir as rundir
        from summit_errors import send_processor_warning

        from summit_methane import GcRun, Base, add_formulas_and_format_sheet
        from summit_core import Config, connect_to_db, append_df_to_excel
        from summit_core import methane_dir, core_dir, data_file_paths

        methane_sheet = data_file_paths.get('methane_sheet', None)

        if not methane_sheet:
            logger.error(
                'Filepath for the methane integration sheet could not be retrieved.'
            )
            send_processor_warning(
                PROC, 'Filepath Error',
                '''The methane integration sheet filepath could not be retrieved. It should be listed
                                   as "methane_sheet" in file_locations.json in the core folder.'''
            )
            return False

    except ImportError as e:
        logger.error('ImportError occurred in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_methane.sqlite',
                                        rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(
            f'Exception {e.args} prevented connection to the database in read_excel_sheet()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db(
            'sqlite:///summit_core.sqlite', core_dir)
        Config.__table__.create(core_engine, checkfirst=True)

        methane_sheet_read_config = (core_session.query(Config).filter(
            Config.processor == 'methane_sheet_read').one_or_none())

        if not methane_sheet_read_config:
            methane_sheet_read_config = Config(processor='methane_sheet_read')
            # use all default values except processor on init
            core_session.add(methane_sheet_read_config)
            core_session.commit()

    except Exception as e:
        logger.error(
            f'Error {e.args} prevented connecting to the core database in read_excel_sheet()'
        )
        send_processor_email(PROC, exception=e)
        return False

    try:
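        # sheet-reading logic will go here; for now only the config entry is touched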

        core_session.merge(methane_sheet_read_config)
        core_session.commit()

        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        logger.error(f'Exception {e.args} occurred in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False
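
read_excel_sheet() above is still a stub; a hedged sketch of what the eventual reading step might look like (load_manual_integrations and its dropna logic are assumptions; only the column names come from update_excel_sheet()):

import pandas as pd

def load_manual_integrations(sheet_path):
    """Read the integration sheet and keep only rows where at least one peak area was filled in by hand."""
    df = pd.read_excel(sheet_path)
    return df.dropna(subset=['peak1', 'peak2'], how='all')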