def gen_renewable_sql(t):
    #Hour		GEOTHERMAL	BIOMASS		BIOGAS		SMALL HYDRO	WIND TOTAL	SOLAR PV	SOLAR THERMAL
    renewable_ddl = 'CREATE TABLE IF NOT EXISTS renewable (id INTEGER PRIMARY KEY ASC, date TEXT, hour INT, geothermal INT, biomass INT, biogas INT, small_hydro INT, wind_total INT, solar_pv INT, solar_thermal INT, solar INT, UNIQUE(date, hour));'
    res = [renewable_ddl]
    for idx in range(1, 26):
        try:
            if idx in t['data']:
                row = t['data'][idx]
                date = t['date']
                hour = int(row[0])
                geothermal = int_or_none(row[1])
                biomass = int_or_none(row[2])
                biogas = int_or_none(row[3])
                small_hydro = int_or_none(row[4])
                wind_total = int_or_none(row[5])
                # some source files report SOLAR PV and SOLAR THERMAL as
                # separate columns, others report a single SOLAR column
                if len(row) == 8:
                    solar_pv = int_or_none(row[6])
                    solar_thermal = int_or_none(row[7])
                    sql = f'INSERT INTO renewable (date, hour, geothermal, biomass, biogas, small_hydro, wind_total, solar_pv, solar_thermal) VALUES ("{date}", {hour}, {geothermal}, {biomass}, {biogas}, {small_hydro}, {wind_total}, {solar_pv}, {solar_thermal});'
                else:
                    solar = int_or_none(row[6])
                    sql = f'INSERT INTO renewable (date, hour, geothermal, biomass, biogas, small_hydro, wind_total, solar) VALUES ("{date}", {hour}, {geothermal}, {biomass}, {biogas}, {small_hydro}, {wind_total}, {solar});'
                res.append(sql)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "gen_renewable_sql",
                    "src": "30_pars.py",
                    "date": t['date'].strftime('%Y%m%d'),
                    "idx": idx,
                    "exception": str(e),
                })
    return res
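# A minimal sketch (not from the original source) of the table dict that
# gen_renewable_sql() consumes; its shape mirrors what extract_table() below
# returns, and every value here is invented for illustration.
def _example_gen_renewable_sql():
    import datetime
    t = {
        'date': datetime.date(2020, 1, 1),  # a date object, per the strftime call above
        'header': 'Hourly Breakdown of Renewable Resources (MW)',  # illustrative
        'columns': ['Hour', 'GEOTHERMAL', 'BIOMASS', 'BIOGAS', 'SMALL_HYDRO',
                    'WIND_TOTAL', 'SOLAR_PV', 'SOLAR_THERMAL'],
        # 'data' maps the 1-based hour index to the raw row: hour first,
        # then one value per column
        'data': {
            1: ['1', '40', '300', '200', '150', '1200', '0', '0'],
            2: ['2', '40', '310', '205', '148', '1150', '0', '0'],
        },
    }
    # returns [CREATE TABLE ..., INSERT for hour 1, INSERT for hour 2]
    return gen_renewable_sql(t)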
def gen_total_sql(t):
    #Hour		RENEWABLES	NUCLEAR		THERMAL		IMPORTS		HYDRO
    total_ddl = 'CREATE TABLE IF NOT EXISTS total (id INTEGER PRIMARY KEY ASC, date TEXT, hour INT, renewables INT, nuclear INT, thermal INT, imports INT, hydro INT, UNIQUE(date, hour));'
    res = [total_ddl]
    for idx in range(1, 26):
        try:
            if idx in t['data']:
                row = t['data'][idx]
                date = t['date']
                hour = int(row[0])
                renewables = int_or_none(row[1])
                nuclear = int_or_none(row[2])
                thermal = int_or_none(row[3])
                imports = int_or_none(row[4])
                hydro = int_or_none(row[5])
                sql = f'INSERT INTO total (date, hour, renewables, nuclear, thermal, imports, hydro) VALUES ("{date}", {hour}, {renewables}, {nuclear}, {thermal}, {imports}, {hydro});'
                res.append(sql)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "gen_total_sql",
                    "src": "30_pars.py",
                    "date": t['date'].strftime('%Y%m%d'),
                    "idx": idx,
                    "exception": str(e),
                })
    return res
# Example #3
def parse(logger, resource_name, input_files, input_dir, output_dir):
    failed_state = os.path.join(output_dir, 'failed.txt')
    failed_input_files = []

    if os.path.exists(failed_state):
        with open(failed_state, 'r') as fh:
            failed_input_files = [l.rstrip() for l in fh]

    s_failed_input_files = set(failed_input_files)
    s_input_files = set(input_files)
    s_unprocessed_files = s_input_files - s_failed_input_files
    unprocessed_files = sorted(list(s_unprocessed_files))

    with open(failed_state, 'a') as fh:
        for f in unprocessed_files:
            try:
                yield parse_file(logger, resource_name, f, input_dir,
                                 output_dir)
            except Exception as e:
                fh.write("%s\n" % f)
                tb = traceback.format_exc()
                log.error(
                    logger, {
                        "src": resource_name,
                        "action": "parse",
                        "xml_file": f,
                        "msg": "parse failed",
                        "ERROR": "Failed to parse xml file",
                        "exception": str(e),
                        "trace": str(tb),
                    })
def extract_table(date, s):
    # first line is header
    # second line is column names
    # remaining 24 lines are data
    lines = s.split('\n')
    header = lines[0].lstrip().rstrip()
    columns = [
        s.replace(' ', '_')
        for s in list(filter(lambda x: len(x) > 0, lines[1].split('\t')))
    ]
    data = [x.split() for x in lines[2:]][:24]
    datamap = {}
    for row in data:
        try:
            try:
                idx = int(row[0])
            except ValueError:
                # fall back for rows where the hour is not a plain integer string
                idx = int(round(float(row[0])))
            datamap[idx] = row
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "extract_table",
                    "src": "30_pars.py",
                    "row": row,
                    "raw_table": s,
                    "exception": str(e),
                })
    return {
        'date': date,
        'header': header,
        'columns': columns,
        'data': datamap,
    }
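# A hedged example (invented values) of the raw text block extract_table()
# parses: a header line, a tab-separated column-name line, then whitespace-
# separated hourly data rows.
def _example_extract_table():
    import datetime
    raw = (
        "Hourly Breakdown of Renewable Resources (MW)\n"
        "Hour\tGEOTHERMAL\tBIOMASS\tBIOGAS\tSMALL HYDRO\tWIND TOTAL\tSOLAR PV\tSOLAR THERMAL\n"
        "1 40 300 200 150 1200 0 0\n"
        "2 40 310 205 148 1150 0 0"
    )
    # returns {'date': ..., 'header': ..., 'columns': ['Hour', 'GEOTHERMAL', ...],
    #          'data': {1: ['1', '40', ...], 2: ['2', '40', ...]}}
    return extract_table(datetime.date(2020, 1, 1), raw)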
# Example #5
def _sani_by_type(t, v):
    # nested helper: relies on s_values, c, self and SqlTypeEnum from the
    # enclosing sqlite_sanitize_values scope
    if v is None:
        s_values.append("\"\"")
    else:
        if t == SqlTypeEnum.NULL:
            # should not have NULL types (these are converted to TEXT type
            # at the end of the initial scan)
            s_values.append("\"\"")
        elif t == SqlTypeEnum.TEXT:
            s_values.append(self.quote_identifier(v))
        elif t == SqlTypeEnum.INTEGER:
            s_values.append(v)
        elif t == SqlTypeEnum.REAL:
            s_values.append(v)
        elif t == SqlTypeEnum.BLOB:
            s_values.append(base64.b64encode(v))
        else:
            log.error(
                self.logger, {
                    "name": __name__,
                    "method": "sqlite_sanitize_values._sani_by_type",
                    "column": str(c),
                    "value": str(v),
                    "type": str(t),
                    "sql_types": str(self.sql_types),
                    "ERROR": "Failed to sanitize value",
                })
def restore_from_s3(logger, feed, ed_path, service):
    """
    Restore feed dist from an S3 bucket.

    It'd be easy if we could simply 'rclone' from the S3 service and
    have the entire bucket replicated here. I've not had any luck with
    that approach.

    Here's the brute force solution. Use the state files,
    '[xml|sql|db|save]/state.txt', to direct the download operations.  
    """
    chlogger = logger.getChild(__name__)
    url_tuples = s3_artifact_urls(chlogger, feed, ed_path, service)
    try:
        for (url, target) in url_tuples:
            r = requests.get(url)
            if r.status_code == 200:
                with open(target, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                log.info(
                    chlogger, {
                        "name": __name__,
                        "method": "restore_from_s3",
                        "feed": feed,
                        "path": ed_path,
                        "service": service,
                        "url": url,
                        "target": target,
                        "message": "Restore succeeded",
                    })
                # decompress gzipped artifacts after a successful download
                target_parts = os.path.splitext(target)
                if target_parts[1] == ".gz":
                    subprocess.run(["pigz", "-d", target])
                # yield the successfully downloaded url
                yield url
            else:
                log.error(
                    chlogger, {
                        "name": __name__,
                        "method": "restore_from_s3",
                        "feed": feed,
                        "path": ed_path,
                        "service": service,
                        "url": url,
                        "target": target,
                        "ERROR": "Failed to retrieve artifact from S3",
                    })
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "restore_from_s3",
                "feed": feed,
                "path": ed_path,
                "service": service,
                "ERROR": "Failed to restore from S3",
                "exception": str(e)
            })
def parse_text_files(logger, resource_name, new_files, txt_dir, sql_dir):
    for f in new_files:
        try:
            yield parse_text_file(logger, resource_name, txt_dir, sql_dir, f)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "parse_text_files",
                    "src": "30_pars.py",
                    "resource": resource_name,
                    "input": os.path.join(txt_dir, f),
                    "sql_dir": sql_dir,
                    "exception": str(e),
                })
def archive_to_s3(logger, feed, ed_path, service, bwlimit="100M"):
    """
    Archive feed dist to an S3 bucket.
    """
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    dist_dir = os.path.join(feed_dir, 'dist')
    s3_dir = os.path.join('eap', 'energy-dashboard', 'data', feed)
    cmd = "rclone sync --bwlimit=%s --no-update-modtime --verbose %s/dist %s:%s" % (
        bwlimit, feed_dir, service, s3_dir)
    log.info(
        chlogger, {
            "name": __name__,
            "method": "archive_to_s3",
            "feed": feed,
            "path": ed_path,
            "service": service,
            "s3_dir": s3_dir,
            "cmd": cmd,
        })
    if not os.path.exists(dist_dir) \
            or not os.path.exists(os.path.join(dist_dir, 'zip')) \
            or not os.path.exists(os.path.join(dist_dir, 'db')):
        log.error(
            chlogger, {
                "name": __name__,
                "method": "archive_to_s3",
                "feed": feed,
                "path": ed_path,
                "dist_dir": dist_dir,
                "service": service,
                "s3_dir": s3_dir,
                "ERROR":
                "One of dist_dir|dist_dir/zip|dist_dir/db does not exist",
            })
        sys.exit(1)
    return runyield([cmd], feed_dir)
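# For orientation (not from the original source): with a hypothetical feed
# 'renewables-watch' and an rclone remote named 's3remote', archive_to_s3()
# would run something like
#   rclone sync --bwlimit=100M --no-update-modtime --verbose \
#       <ed_path>/data/renewables-watch/dist s3remote:eap/energy-dashboard/data/renewables-watch
# i.e. the feed's dist/ tree (containing zip/ and db/) is mirrored into the bucket path.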
def insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth, max_depth):
    chlogger    = logger.getChild(__name__)
    db_name     = gen_db_name(resource_name, depth)
    sql_file    = os.path.join(sql_dir, sql_file_name)
    db_file     = os.path.join(db_dir, db_name)
    if depth > max_depth:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "db_file"   : db_file,
            "file_idx"  : idx,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "max_depth" : max_depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     :"insert sql_file failed, max_depth exceeded",
            })
        return

    log.info(chlogger, {
        "name"      : __name__,
        "src"       : resource_name,
        "method"    : "insert_file",
        "db_file"   : db_file,
        "file_idx"  : idx,
        "sql_file"  : sql_file,
        "depth"     : depth,
        "dbmgr"     : str(dbmgr),
        "message"   : "started",
        })
        
    cnx = dbmgr.get(db_file)
    try:
        with open(sql_file, 'r') as sf:
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "started",
                })
            cnx.executescript(sf.read())
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "completed",
                })
        return sql_file_name
    except Exception as e:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "file_idx"  : idx,
            "db_file"   : db_file,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     : "insert sql_file failed",
            "exception": str(e),
            })
        # retry into the next-depth database, passing the bare file name so the
        # sql_dir prefix is not joined twice
        return insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth+1, max_depth)
# Example #10
def download(logger,
             resource_name,
             delay,
             urls,
             state_file,
             path,
             ending=".zip"):
    """
    urls        : list of urls to download
    state_file  : list of urls that have already been downloaded
    path        : path to write downloaded files to
    """
    chlogger = logger.getChild(__name__)
    downloaded = []
    prev_downloaded = set()
    if os.path.exists(state_file):
        with open(state_file, "r") as f:
            prev_downloaded = set([line.rstrip() for line in f])

    status = {'manifest': 0, 'filesystem': 0, 'downloaded': 0, 'error': 0}

    for url in urls:
        try:
            filename = filesystem.url2filename(url, ending=ending)
            if url in prev_downloaded:
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'url exists in download manifest'
                    })
                status['manifest'] += 1
                continue
            target_file = os.path.join(path, filename)
            if os.path.exists(target_file):
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'file exists locally, updating manifest'
                    })
                # update the state_file with files that were found on disk
                downloaded.append(url)
                status['filesystem'] += 1
                continue
            r = requests.get(url)
            if r.status_code == 200:
                with open(target_file, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                downloaded.append(url)
                status['downloaded'] += 1
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename
                    })
            else:
                log.error(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename,
                        "status_code": r.status_code,
                        "ERROR": 'http_request_failed'
                    })
        except Exception as e:
            log.error(
                chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "ERROR": "http_request_failed",
                    "exception": str(e),
                    "traceback": str(tb=traceback.format_exc())
                })
            status['error'] += 1
        # TODO: this is such a hack
        time.sleep(delay)
        # ensure that all files in the download directory are read only
        for f in filesystem.glob_dir(path, ending):
            os.chmod(os.path.join(path, f), S_IREAD | S_IRGRP | S_IROTH)
        log.info(
            chlogger, {
                "src": resource_name,
                "action": 'download',
                "url": url,
                'skipped_in_manifest': status['manifest'],
                'skipped_in_filesystem': status['filesystem'],
                'downloaded': status['downloaded'],
                'error': status['error'],
            })
    return downloaded
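# A minimal usage sketch for download(); the url, resource name and paths are
# made up, and the real caller (run(), below) builds its url list with
# web.generate_urls().
def _example_download(logger):
    urls = ["http://example.invalid/20200101_DailyRenewablesWatch.txt"]
    # state_file lists urls already fetched; path is where new files land
    return download(logger, "renewables-watch", 5, urls,
                    "state.txt", "./downloads", ending=".txt")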
# Example #11
def run(logger, manifest, config):
    start_date = datetime.date(*manifest['start_date'])
    resource_name = manifest['name']
    resource_url = manifest['url']
    delay = manifest['download_delay_secs']
    download_dir = config['working_dir']
    txt_dir = config['source_dir']
    state_file = config['state_file']
    # sleep for N seconds in between downloads to meet CAISO expected-use requirements
    dates = xtime.range_pairs(xtime.day_range_to_today(start_date))
    urls = list(web.generate_urls(logger, dates, resource_url))
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "url": resource_url,
            "delay": delay,
            "download_dir": download_dir,
            "state_file": state_file,
            "start_date": str(start_date),
            "urls_count": len(urls),
        })

    # download .txt files
    downloaded_txt_urls = web.download(logger,
                                       resource_name,
                                       delay,
                                       urls,
                                       state_file,
                                       download_dir,
                                       ending='.txt')

    # copy .txt files to the ./txt dir and then
    # compress the original .txt files to .zip files
    if not os.path.exists(txt_dir):
        log.debug(
            logger, {
                "name": __name__,
                "method": "run",
                "src": "10_down.py",
                "message": "created target txt dir: %s" % txt_dir,
            })
        os.makedirs(txt_dir)

    # process downloaded .txt files
    data_files = glob.glob(
        os.path.join(download_dir, "*DailyRenewablesWatch.txt"))
    for tf in data_files:
        try:
            # glob returns paths that already include download_dir, so build
            # all related paths from the bare file name
            tf_name = os.path.basename(tf)
            fqtf = os.path.join(download_dir, tf_name)
            fqtf2 = os.path.join(txt_dir, tf_name)
            fqtfzip = os.path.join(download_dir, '%s.zip' % tf_name)

            # remove write protections from the .txt file
            os.chmod(
                fqtf,
                S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)

            # if the .txt file is here, it needs to be copied to the ./txt dir
            if not os.path.exists(fqtf2):
                shutil.copyfile(fqtf, fqtf2)

            # compress the original .txt file (not the zip itself) into the .zip
            with zipfile.ZipFile(fqtfzip, 'w') as myzip:
                myzip.write(fqtf, arcname=tf_name)

            # set .zip file to be read only
            os.chmod(fqtfzip, S_IREAD | S_IRGRP | S_IROTH)

            # remove the original .txt file as it's been copied to txt/.txt
            if os.path.exists(fqtf2) and os.path.exists(fqtfzip):
                os.remove(fqtf)
            log.debug(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "message": "zipped file: %s" % tf_name,
                })
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "file": tf,
                    "error": "failed to process file",
                    "exception": str(e),
                })

    # TODO: something is clobbering perms on the state file, so clobber it back
    os.chmod(os.path.join(download_dir, 'state.txt'),
             S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)
    # final step
    state.update(downloaded_txt_urls, state_file)
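# A hedged sketch (illustrative values only) of the manifest and config
# dictionaries run() reads above; the key names come straight from the
# lookups in run(), everything else is an assumption.
def _example_run(logger):
    manifest = {
        'name': 'renewables-watch',            # resource name
        'url': 'http://example.invalid/',      # base url passed to web.generate_urls()
        'start_date': (2020, 1, 1),            # unpacked into datetime.date(*...)
        'download_delay_secs': 5,
    }
    config = {
        'working_dir': './working',            # where .txt files are downloaded and zipped
        'source_dir': './txt',                 # where .txt files are copied for parsing
        'state_file': './working/state.txt',   # manifest of already-downloaded urls
    }
    return run(logger, manifest, config)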