def gen_renewable_sql(t):
    # Hour GEOTHERMAL BIOMASS BIOGAS SMALL HYDRO WIND TOTAL SOLAR PV SOLAR THERMAL
    renewable_ddl = 'CREATE TABLE IF NOT EXISTS renewable (id PRIMARY KEY ASC, date TEXT, hour INT, geothermal INT, biomass INT, biogas INT, small_hydro INT, wind_total INT, solar_pv INT, solar_thermal INT, solar INT, UNIQUE(date, hour));'
    res = [renewable_ddl]
    for idx in range(1, 26):
        try:
            if idx in t['data']:
                row = t['data'][idx]
                date = t['date']
                hour = int(row[0])
                geothermal = int_or_none(row[1])
                biomass = int_or_none(row[2])
                biogas = int_or_none(row[3])
                small_hydro = int_or_none(row[4])
                wind_total = int_or_none(row[5])
                if len(row) == 8:
                    solar_pv = int_or_none(row[6])
                    solar_thermal = int_or_none(row[7])
                    sql = f'INSERT INTO renewable (date, hour, geothermal, biomass, biogas, small_hydro, wind_total, solar_pv, solar_thermal) VALUES ("{date}", {hour}, {geothermal}, {biomass}, {biogas}, {small_hydro}, {wind_total}, {solar_pv}, {solar_thermal});'
                else:
                    solar = int_or_none(row[6])
                    sql = f'INSERT INTO renewable (date, hour, geothermal, biomass, biogas, small_hydro, wind_total, solar) VALUES ("{date}", {hour}, {geothermal}, {biomass}, {biogas}, {small_hydro}, {wind_total}, {solar});'
                res.append(sql)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "gen_renewable_sql",
                    "src": "30_pars.py",
                    "date": t['date'].strftime('%Y%m%d'),
                    "idx": idx,
                    "exception": str(e),
                })
    return res
def gen_total_sql(t):
    # Hour RENEWABLES NUCLEAR THERMAL IMPORTS HYDRO
    total_ddl = 'CREATE TABLE IF NOT EXISTS total (id PRIMARY KEY ASC, date TEXT, hour INT, renewables INT, nuclear INT, thermal INT, imports INT, hydro INT, UNIQUE(date, hour));'
    res = [total_ddl]
    for idx in range(1, 26):
        try:
            if idx in t['data']:
                row = t['data'][idx]
                date = t['date']
                hour = int(row[0])
                renewables = int_or_none(row[1])
                nuclear = int_or_none(row[2])
                thermal = int_or_none(row[3])
                imports = int_or_none(row[4])
                hydro = int_or_none(row[5])
                sql = f'INSERT INTO total (date, hour, renewables, nuclear, thermal, imports, hydro) VALUES ("{date}", {hour}, {renewables}, {nuclear}, {thermal}, {imports}, {hydro});'
                res.append(sql)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "gen_total_sql",
                    "src": "30_pars.py",
                    "date": t['date'].strftime('%Y%m%d'),
                    "idx": idx,
                    "exception": str(e),
                })
    return res
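# NOTE (editor sketch): `int_or_none` is used by the two generators above but is
# not defined in this excerpt. A minimal assumed implementation is shown here
# for reference only; the real helper lives elsewhere in the repo and may
# differ (e.g. in how it treats blanks or placeholder strings).
def int_or_none(s):
    # coerce a table cell to an int, returning None when the cell is empty
    # or cannot be parsed as a number
    try:
        return int(round(float(s)))
    except (TypeError, ValueError):
        return None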
def parse(logger, resource_name, input_files, input_dir, output_dir):
    failed_state = os.path.join(output_dir, 'failed.txt')
    failed_input_files = []
    if os.path.exists(failed_state):
        with open(failed_state, 'r') as fh:
            failed_input_files = [l.rstrip() for l in fh]
    s_failed_input_files = set(failed_input_files)
    s_input_files = set(input_files)
    s_unprocessed_files = s_input_files - s_failed_input_files
    unprocessed_files = sorted(list(s_unprocessed_files))
    with open(failed_state, 'a') as fh:
        for f in unprocessed_files:
            try:
                yield parse_file(logger, resource_name, f, input_dir, output_dir)
            except Exception as e:
                # record the failure so the file is skipped on subsequent runs
                fh.write("%s\n" % f)
                tb = traceback.format_exc()
                log.error(
                    logger, {
                        "src": resource_name,
                        "action": "parse",
                        "xml_file": f,
                        "msg": "parse failed",
                        "ERROR": "Failed to parse xml file",
                        "exception": str(e),
                        "trace": str(tb),
                    })
def extract_table(date, s):
    # first line is header
    # second line is column names
    # remaining 24 lines are data
    lines = s.split('\n')
    header = lines[0].strip()
    columns = [
        s.replace(' ', '_')
        for s in list(filter(lambda x: len(x) > 0, lines[1].split('\t')))
    ]
    data = [x.split() for x in lines[2:]][:24]
    datamap = {}
    for row in data:
        try:
            try:
                idx = int(row[0])
            except ValueError:
                idx = int(round(float(row[0])))
            datamap[idx] = row
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "extract_table",
                    "src": "30_pars.py",
                    "row": row,
                    "raw_table": s,
                    "exception": str(e),
                })
    return {
        'date': date,
        'header': header,
        'columns': columns,
        'data': datamap,
    }
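# Illustrative only (editor sketch, not real CAISO data): the shape of the raw
# text block extract_table() expects -- a title line, a tab-separated column
# header, then one whitespace-separated row per hour. Title text and values
# here are placeholders.
_EXAMPLE_RENEWABLES_TABLE = (
    "Hourly Breakdown of Renewable Resources (MW)\n"
    "\tHour\tGEOTHERMAL\tBIOMASS\tBIOGAS\tSMALL HYDRO\tWIND TOTAL\tSOLAR PV\tSOLAR THERMAL\n"
    "\t1\t900\t300\t200\t400\t1500\t0\t0\n"
    "\t2\t900\t310\t210\t400\t1600\t0\t0"
)
# extract_table(datetime.date(2020, 1, 1), _EXAMPLE_RENEWABLES_TABLE) returns a
# dict whose 'columns' come from the header row and whose 'data' maps each hour
# (1..24) to its row of strings, ready to feed gen_renewable_sql().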
def _sani_by_type(t, v):
    # closure: s_values, c, and self come from the enclosing
    # sqlite_sanitize_values scope (see the "method" field in the log below)
    if v is None:
        s_values.append("\"\"")
    else:
        if t == SqlTypeEnum.NULL:
            # should not have NULL types (these are converted to TEXT type
            # at the end of the initial scan)
            s_values.append("\"\"")
        elif t == SqlTypeEnum.TEXT:
            s_values.append(self.quote_identifier(v))
        elif t == SqlTypeEnum.INTEGER:
            s_values.append(v)
        elif t == SqlTypeEnum.REAL:
            s_values.append(v)
        elif t == SqlTypeEnum.BLOB:
            s_values.append(base64.b64encode(v))
        else:
            log.error(
                self.logger, {
                    "name": __name__,
                    "method": "sqlite_sanitize_values._sani_by_type",
                    "column": str(c),
                    "value": str(v),
                    "type": str(t),
                    "sql_types": str(self.sql_types),
                    "ERROR": "Failed to sanitize value",
                })
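# NOTE (editor sketch): SqlTypeEnum is referenced by _sani_by_type but not
# defined in this excerpt. Its members (NULL, TEXT, INTEGER, REAL, BLOB) mirror
# SQLite's storage classes, so an assumed definition would look roughly like
# this; the real one elsewhere in the repo may differ.
import enum

class SqlTypeEnum(enum.Enum):
    NULL = 0
    TEXT = 1
    INTEGER = 2
    REAL = 3
    BLOB = 4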
def restore_from_s3(logger, feed, ed_path, service):
    """
    Restore feed dist from an S3 bucket.

    It'd be easy if we could simply 'rclone' from the S3 service and have the
    entire bucket replicated here. I've not had any luck with that approach.
    Here's the brute force solution: use the state files,
    '[xml|sql|db|save]/state.txt', to direct the download operations.
    """
    chlogger = logger.getChild(__name__)
    url_tuples = s3_artifact_urls(chlogger, feed, ed_path, service)
    try:
        for (url, target) in url_tuples:
            r = requests.get(url)
            if r.status_code == 200:
                with open(target, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                log.info(
                    chlogger, {
                        "name": __name__,
                        "method": "restore_from_s3",
                        "feed": feed,
                        "path": ed_path,
                        "service": service,
                        "url": url,
                        "target": target,
                        "message": "Restore succeeded",
                    })
            else:
                log.error(
                    chlogger, {
                        "name": __name__,
                        "method": "restore_from_s3",
                        "feed": feed,
                        "path": ed_path,
                        "service": service,
                        "url": url,
                        "target": target,
                        "ERROR": "Failed to retrieve artifact from S3",
                    })
            target_parts = os.path.splitext(target)
            if target_parts[1] == ".gz":
                # pass the command as an argument list; a bare string would
                # require shell=True
                subprocess.run(["pigz", "-d", target])
            # return downloaded urls
            yield url
    except Exception as e:
        log.critical(
            chlogger, {
                "name": __name__,
                "method": "restore_from_s3",
                "feed": feed,
                "path": ed_path,
                "service": service,
                "ERROR": "Failed to restore from S3",
                "exception": str(e)
            })
def parse_text_files(logger, resource_name, new_files, txt_dir, sql_dir):
    for f in new_files:
        try:
            yield parse_text_file(logger, resource_name, txt_dir, sql_dir, f)
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "parse_text_files",
                    "src": "30_pars.py",
                    "resource": resource_name,
                    "input": os.path.join(txt_dir, f),
                    "sql_dir": sql_dir,
                    "exception": str(e),
                })
def archive_to_s3(logger, feed, ed_path, service, bwlimit="100M"):
    """
    Archive feed dist to an S3 bucket.
    """
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    dist_dir = os.path.join(feed_dir, 'dist')
    s3_dir = os.path.join('eap', 'energy-dashboard', 'data', feed)
    cmd = "rclone sync --bwlimit=%s --no-update-modtime --verbose %s/dist %s:%s" % (
        bwlimit, feed_dir, service, s3_dir)
    log.info(
        chlogger, {
            "name": __name__,
            "method": "archive_to_s3",
            "feed": feed,
            "path": ed_path,
            "service": service,
            "s3_dir": s3_dir,
            "cmd": cmd,
        })
    if not os.path.exists(dist_dir) \
            or not os.path.exists(os.path.join(dist_dir, 'zip')) \
            or not os.path.exists(os.path.join(dist_dir, 'db')):
        log.error(
            chlogger, {
                "name": __name__,
                "method": "archive_to_s3",
                "feed": feed,
                "path": ed_path,
                "dist_dir": dist_dir,
                "service": service,
                "s3_dir": s3_dir,
                "ERROR": "One of dist_dir|dist_dir/zip|dist_dir/db does not exist",
            })
        sys.exit(1)
    return runyield([cmd], feed_dir)
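# Expected layout, inferred from the existence checks above: the feed's dist
# directory must contain at least 'zip/' and 'db/' before the rclone sync runs,
# e.g.
#   <ed_path>/data/<feed>/dist/zip/...
#   <ed_path>/data/<feed>/dist/db/...
# Anything under dist/ is mirrored to <service>:eap/energy-dashboard/data/<feed>.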
def insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth, max_depth):
    chlogger = logger.getChild(__name__)
    db_name = gen_db_name(resource_name, depth)
    sql_file = os.path.join(sql_dir, sql_file_name)
    db_file = os.path.join(db_dir, db_name)
    if depth > max_depth:
        log.error(
            chlogger, {
                "name": __name__,
                "src": resource_name,
                "method": "insert_file",
                "db_file": db_file,
                "file_idx": idx,
                "sql_file": sql_file,
                "depth": depth,
                "max_depth": max_depth,
                "dbmgr": str(dbmgr),
                "ERROR": "insert sql_file failed, max_depth exceeded",
            })
        return
    log.info(
        chlogger, {
            "name": __name__,
            "src": resource_name,
            "method": "insert_file",
            "db_file": db_file,
            "file_idx": idx,
            "sql_file": sql_file,
            "depth": depth,
            "dbmgr": str(dbmgr),
            "message": "started",
        })
    cnx = dbmgr.get(db_file)
    try:
        with open(sql_file, 'r') as sf:
            log.debug(
                chlogger, {
                    "name": __name__,
                    "src": resource_name,
                    "method": "insert_file",
                    "db_file": db_file,
                    "file_idx": idx,
                    "sql_file": sql_file,
                    "depth": depth,
                    "dbmgr": str(dbmgr),
                    "message": "started",
                })
            cnx.executescript(sf.read())
            log.debug(
                chlogger, {
                    "name": __name__,
                    "src": resource_name,
                    "method": "insert_file",
                    "db_file": db_file,
                    "file_idx": idx,
                    "sql_file": sql_file,
                    "depth": depth,
                    "dbmgr": str(dbmgr),
                    "message": "completed",
                })
        return sql_file_name
    except Exception as e:
        log.error(
            chlogger, {
                "name": __name__,
                "src": resource_name,
                "method": "insert_file",
                "file_idx": idx,
                "db_file": db_file,
                "sql_file": sql_file,
                "depth": depth,
                "dbmgr": str(dbmgr),
                "ERROR": "insert sql_file failed",
                "exception": str(e),
            })
        # retry against the next database in the chain; pass the file *name*
        # (not the joined path) so the recursive call re-joins it with sql_dir
        insert_file(logger, resource_name, dbmgr, sql_dir, db_dir,
                    sql_file_name, idx, depth + 1, max_depth)
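# Illustrative usage (editor sketch, not from the source): insert_file() is
# typically driven over a directory of generated .sql files; on failure it
# recurses with depth+1 so the failing script is retried against the next
# database in the chain, up to max_depth. The max_depth value below is an
# arbitrary example.
#
# for i, name in enumerate(sorted(filesystem.glob_dir(sql_dir, '.sql'))):
#     insert_file(logger, resource_name, dbmgr, sql_dir, db_dir,
#                 name, i, depth=0, max_depth=3)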
def download(logger, resource_name, delay, urls, state_file, path, ending=".zip"):
    """
    urls       : list of urls to download
    state_file : file listing urls that have already been downloaded
    path       : path to write downloaded files to
    """
    chlogger = logger.getChild(__name__)
    downloaded = []
    prev_downloaded = set()
    if os.path.exists(state_file):
        with open(state_file, "r") as f:
            prev_downloaded = set([line.rstrip() for line in f])
    status = {'manifest': 0, 'filesystem': 0, 'downloaded': 0, 'error': 0}
    for url in urls:
        try:
            filename = filesystem.url2filename(url, ending=ending)
            if url in prev_downloaded:
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'url exists in download manifest'
                    })
                status['manifest'] += 1
                continue
            target_file = os.path.join(path, filename)
            if os.path.exists(target_file):
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'file exists locally, updating manifest'
                    })
                # update the state_file with files that were found on disk
                downloaded.append(url)
                status['filesystem'] += 1
                continue
            r = requests.get(url)
            if r.status_code == 200:
                with open(target_file, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                downloaded.append(url)
                status['downloaded'] += 1
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename
                    })
            else:
                log.error(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename,
                        "status_code": r.status_code,
                        "ERROR": 'http_request_failed'
                    })
        except Exception as e:
            log.error(
                chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "ERROR": "http_request_failed",
                    "exception": str(e),
                    "traceback": str(traceback.format_exc())
                })
            status['error'] += 1
        # TODO: this is such a hack
        time.sleep(delay)
    # ensure that all files in the download directory are read only
    for f in filesystem.glob_dir(path, ending):
        os.chmod(os.path.join(path, f), S_IREAD | S_IRGRP | S_IROTH)
    log.info(
        chlogger, {
            "src": resource_name,
            "action": 'download',
            "url": url,
            'skipped_in_manifest': status['manifest'],
            'skipped_in_filesystem': status['filesystem'],
            'downloaded': status['downloaded'],
            'error': status['error'],
        })
    return downloaded
def run(logger, manifest, config):
    start_date = datetime.date(*manifest['start_date'])
    resource_name = manifest['name']
    resource_url = manifest['url']
    delay = manifest['download_delay_secs']
    download_dir = config['working_dir']
    txt_dir = config['source_dir']
    state_file = config['state_file']
    # sleep for N seconds in between downloads to meet caiso expected use requirements
    dates = xtime.range_pairs(xtime.day_range_to_today(start_date))
    urls = list(web.generate_urls(logger, dates, resource_url))
    log.debug(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "url": resource_url,
            "delay": delay,
            "download_dir": download_dir,
            "state_file": state_file,
            "start_date": str(start_date),
            "urls_count": len(urls),
        })
    # download .txt files
    downloaded_txt_urls = web.download(logger,
                                       resource_name,
                                       delay,
                                       urls,
                                       state_file,
                                       download_dir,
                                       ending='.txt')
    # copy .txt files to the ./txt dir and then
    # compress original .txt files to .zip files
    if not os.path.exists(txt_dir):
        log.debug(
            logger, {
                "name": __name__,
                "method": "run",
                "src": "10_down.py",
                "message": "created target txt dir: %s" % txt_dir,
            })
        os.makedirs(txt_dir)
    # process downloaded .txt files
    data_files = glob.glob(
        os.path.join(download_dir, "*DailyRenewablesWatch.txt"))
    # glob returns full paths; work with bare file names so the joins against
    # download_dir and txt_dir below resolve correctly
    for tf in [os.path.basename(p) for p in data_files]:
        try:
            # remove write protections for .txt files
            os.chmod(
                os.path.join(download_dir, tf),
                S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)
            # if the txt file is here, it needs to be copied to the ./txt dir
            fqtf = os.path.join(download_dir, tf)
            fqtf2 = os.path.join(txt_dir, tf)
            fqtfzip = os.path.join(download_dir, '%s.zip' % tf)
            if not os.path.exists(fqtf2):
                shutil.copyfile(fqtf, fqtf2)
            with zipfile.ZipFile(fqtfzip, 'w') as myzip:
                # archive the original .txt file (not the .zip itself)
                myzip.write(fqtf)
            # set .zip file to be read only
            os.chmod(fqtfzip, S_IREAD | S_IRGRP | S_IROTH)
            # remove the zip/.txt file as it's been copied to txt/.txt
            if os.path.exists(fqtf2) and os.path.exists(fqtfzip):
                os.remove(fqtf)
            log.debug(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "message": "zipped file: %s" % tf,
                })
        except Exception as e:
            log.error(
                logger, {
                    "name": __name__,
                    "method": "run",
                    "src": "10_down.py",
                    "file": tf,
                    "error": "failed to process file",
                    "exception": str(e),
                })
    # TODO: something is clobbering perms on the state file, so clobber it back
    os.chmod(os.path.join(download_dir, 'state.txt'),
             S_IWRITE | S_IWGRP | S_IWOTH | S_IREAD | S_IRGRP | S_IROTH)
    # final step
    state.update(downloaded_txt_urls, state_file)
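# Editor sketch of the inputs run() expects, inferred from the keys it reads
# above. All values are placeholders; real manifests and configs in the repo
# may carry additional fields.
EXAMPLE_MANIFEST = {
    'name': 'DailyRenewablesWatch',       # resource name (placeholder)
    'url': 'https://example.com/renewables/',  # placeholder; real value comes from the feed manifest
    'start_date': (2020, 1, 1),           # unpacked via datetime.date(*...)
    'download_delay_secs': 5,             # polite delay between requests
}
EXAMPLE_CONFIG = {
    'working_dir': 'zip',                 # where downloads land (placeholder path)
    'source_dir': 'txt',                  # where .txt copies are kept (placeholder path)
    'state_file': 'zip/state.txt',        # manifest of already-downloaded urls
}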