def insert(logger, resource_name, sql_dir, db_dir, new_files):
    chlogger = logger.getChild(__name__)
    with DbMgr(chlogger, resource_name) as dbmgr:
        new_files_count = len(new_files)
        os.makedirs(db_dir, exist_ok=True)
        log.info(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert",
            "sql_dir"   : sql_dir,
            "db_dir"    : db_dir,
            "new_files" : new_files_count,
            })
        for (idx, sql_file_name) in enumerate(new_files):
            yield insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth=0, max_depth=5)
       
        save_dir        = os.path.join(os.path.dirname(db_dir), "save")
        save_state_file = os.path.join(save_dir, "state.txt")
        db_files        = filesystem.glob_dir(db_dir, ".db")
        # make sure the save dir exists before writing the state file
        os.makedirs(save_dir, exist_ok=True)
        with open(save_state_file, 'w') as f:
            for dbf in db_files:
                f.write("%s\n" % dbf)
                log.debug(chlogger, {
                    "name"      : __name__,
                    "method"    : "insert",
                    "resource"  : resource_name,
                    "db_file"   : dbf,
                    "state_file": save_state_file,
                    "message"   : "added db_file to state file",
                    })
def run(logger, manifest, config):
    resource_name = manifest['name']
    sql_dir = config['source_dir']
    db_dir = config['working_dir']
    state_file = config['state_file']
    new_files = state.new_files(resource_name, state_file, sql_dir, '.sql')
    log.info(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "sql_dir": sql_dir,
            "db_dir": db_dir,
            "state_file": state_file,
            "new_files_count": len(new_files),
            "message": "started processing sql files",
        })
    state.update(db.insert(logger, resource_name, sql_dir, db_dir, new_files),
                 state_file)
    log.info(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "sql_dir": sql_dir,
            "db_dir": db_dir,
            "state_file": state_file,
            "new_files_count": len(new_files),
            "message": "finished processing sql files",
        })
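
# A minimal sketch of invoking the stage above directly; the manifest and
# config keys mirror the lookups in run, and every value here is a
# hypothetical stand-in.
import logging

logging.basicConfig(level=logging.INFO)
example_logger = logging.getLogger("insert-example")
example_manifest = {"name": "demo-feed"}
example_config = {
    "source_dir":  "sql",        # directory holding generated .sql files
    "working_dir": "db",         # directory for the .db files built from them
    "state_file":  "state.txt",  # record of .sql files already processed
}
run(example_logger, example_manifest, example_config)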
def parse_file(logger, resource_name, xml_input_file_name, input_dir,
               output_dir):
    chlogger = logger.getChild(__name__)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    (base, ext) = os.path.splitext(xml_input_file_name)
    outfile = os.path.join(output_dir, "%s.sql" % (base))
    infile = os.path.join(input_dir, xml_input_file_name)
    with open(outfile, 'w') as outfh:
        with open(infile, 'r') as infh:
            # all the work happens here
            xst = XML2SQLTransormer(chlogger, infh).parse().scan_all()
            # validate the generated ddl and sql by executing them against an
            # in-memory database; a failure here means the ddl/sql
            # combination is invalid
            sqllst = list(xst.ddl()) + list(xst.insertion_sql())
            sqltext = "\n".join(sqllst)
            #db = sqlite3.connect("file::memory:?cache=shared")
            db = sqlite3.connect(":memory:")
            db.executescript(sqltext)
            # all good
            outfh.write(sqltext)
    log.info(
        chlogger, {
            "src": resource_name,
            "action": "parse_file",
            "infile": infile,
            "outfile": outfile,
        })
    return xml_input_file_name
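
# A minimal, standalone sketch of the validation pattern parse_file uses:
# execute the generated sql against a throwaway in-memory database so that
# bad ddl/sql fails fast instead of producing a broken .sql artifact. The
# sample statements are hypothetical.
import sqlite3

def validate_sql(sqltext):
    # executescript raises sqlite3.Error on the first invalid statement
    db = sqlite3.connect(":memory:")
    try:
        db.executescript(sqltext)
    finally:
        db.close()

validate_sql("CREATE TABLE t (id INTEGER PRIMARY KEY, v TEXT);\n"
             "INSERT INTO t (v) VALUES ('x');")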
def git_add_and_commit(logger, resource):
    """
    """
    for cmd in ["git add *", "git commit -am 'update state'", "git push"]:
        proc = subprocess.run(cmd,
                              cwd=os.curdir,
                              shell=True,
                              stdout=subprocess.PIPE,  # capture output so proc.stdout is not None
                              stderr=subprocess.STDOUT)
        log.info(
            logger, {
                "name": __name__,
                "method": "git_add_and_commit",
                "resource": resource,
                "cmd": cmd,
                "stdout": proc.stdout,
                "stderr": proc.stderr,
                "returncode": proc.returncode,
            })
def run(logger, manifest, config):
    resource_name           = manifest['name']
    wasabi_bandwidth_limit  = config['wasabi_bwlimit']
    digitalocean_bandwidth_limit = config['digitalocean_bwlimit']
    log.info(logger, {
        "name"      : __name__,
        "method"    : "run",
        "resource"  : resource_name,
        "message"   : "archiving...",
        })
    ed_path = os.path.dirname(os.path.dirname(os.path.abspath(os.curdir)))
    for output in clifeed.archive_to_s3(logger, resource_name, ed_path, "wasabi", wasabi_bandwidth_limit):
        log.info(logger, {
            "name"      : __name__,
            "method"    : "run",
            "resource"  : resource_name,
            "service"   : "wasabi",
            "stdout"   : str(output),
            })
    for output in clifeed.archive_to_s3(logger, resource_name, ed_path, "digitalocean", digitalocean_bandwidth_limit):
        log.info(logger, {
            "name"      : __name__,
            "method"    : "run",
            "resource"  : resource_name,
            "service"   : "digitalocean",
            "stdout"   : str(output),
            })
    shutil.rmtree(os.path.join(os.curdir, 'dist'))
def archive_to_s3(logger, feed, ed_path, service, bwlimit="100M"):
    """
    Archive feed dist to an S3 bucket.
    """
    chlogger = logger.getChild(__name__)
    feed_dir = os.path.join(ed_path, 'data', feed)
    dist_dir = os.path.join(feed_dir, 'dist')
    s3_dir = os.path.join('eap', 'energy-dashboard', 'data', feed)
    cmd = "rclone sync --bwlimit=%s --no-update-modtime --verbose %s/dist %s:%s" % (
        bwlimit, feed_dir, service, s3_dir)
    log.info(
        chlogger, {
            "name": __name__,
            "method": "archive_to_s3",
            "feed": feed,
            "path": ed_path,
            "service": service,
            "s3_dir": s3_dir,
            "cmd": cmd,
        })
    if (not os.path.exists(dist_dir)
            or not os.path.exists(os.path.join(dist_dir, 'zip'))
            or not os.path.exists(os.path.join(dist_dir, 'db'))):
        log.error(
            chlogger, {
                "name": __name__,
                "method": "archive_to_s3",
                "feed": feed,
                "path": ed_path,
                "dist_dir": dist_dir,
                "service": service,
                "s3_dir": s3_dir,
                "ERROR":
                "One of dist_dir|dist_dir/zip|dist_dir/db does not exist",
            })
        sys.exit(1)
    return runyield([cmd], feed_dir)
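
# A minimal sketch of driving archive_to_s3 directly, assuming an rclone
# remote named "wasabi" is configured and the feed's dist/zip and dist/db
# directories exist; the feed name and path are hypothetical.
import logging

example_logger = logging.getLogger("archive-example")
for line in archive_to_s3(example_logger, "demo-feed",
                          "/path/to/energy-dashboard", "wasabi",
                          bwlimit="10M"):
    print(line)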
def run(logger, manifest, config):
    resource_name = manifest['name']
    db_dir = config['source_dir']
    save_dir = config['working_dir']
    state_file = config['state_file']

    log.info(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "db_dir": db_dir,
            "save_dir": db_dir,
            "state_file": state_file,
            "message": "started saving state",
        })

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        log.info(
            logger, {
                "name": __name__,
                "method": "run",
                "resource": resource_name,
                "db_dir": db_dir,
                "save_dir": db_dir,
                "state_file": state_file,
                "message": "created save dir",
            })

    save.git_add_and_commit(logger, resource_name)

    log.info(
        logger, {
            "name": __name__,
            "method": "run",
            "resource": resource_name,
            "db_dir": db_dir,
            "save_dir": db_dir,
            "state_file": state_file,
            "message": "finished saving state",
        })
def insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth, max_depth):
    chlogger    = logger.getChild(__name__)
    db_name     = gen_db_name(resource_name, depth)
    sql_file    = os.path.join(sql_dir, sql_file_name)
    db_file     = os.path.join(db_dir, db_name)
    if depth > max_depth:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "db_file"   : db_file,
            "file_idx"  : idx,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "max_depth" : max_depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     :"insert sql_file failed, max_depth exceeded",
            })
        return

    log.info(chlogger, {
        "name"      : __name__,
        "src"       : resource_name,
        "method"    : "insert_file",
        "db_file"   : db_file,
        "file_idx"  : idx,
        "sql_file"  : sql_file,
        "depth"     : depth,
        "dbmgr"     : str(dbmgr),
        "message"   : "started",
        })
        
    cnx = dbmgr.get(db_file)
    try:
        with open(sql_file, 'r') as sf:
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "started",
                })
            cnx.executescript(sf.read())
            log.debug(chlogger, {
                "name"      : __name__,
                "src"       : resource_name,
                "method"    : "insert_file",
                "db_file"   : db_file,
                "file_idx"  : idx,
                "sql_file"  : sql_file,
                "depth"     : depth,
                "dbmgr"     : str(dbmgr),
                "message"   : "completed",
                })
        return sql_file_name
    except Exception as e:
        log.error(chlogger, {
            "name"      : __name__,
            "src"       : resource_name,
            "method"    : "insert_file",
            "file_idx"  : idx,
            "db_file"   : db_file,
            "sql_file"  : sql_file,
            "depth"     : depth,
            "dbmgr"     : str(dbmgr),
            "ERROR"     : "insert sql_file failed",
            "exception": str(e),
            })
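        # bounded retry: recurse with depth+1 (gen_db_name takes the depth,
        # so the retry presumably targets a fresh db file); the max_depth
        # guard at the top of this function stops runaway recursion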
        return insert_file(logger, resource_name, dbmgr, sql_dir, db_dir, sql_file_name, idx, depth+1, max_depth)
def download(logger,
             resource_name,
             delay,
             urls,
             state_file,
             path,
             ending=".zip"):
    """
    urls        : list of urls to download
    state_file  : list of urls that have already been downloaded
    path        : path to write downloaded files to
    """
    chlogger = logger.getChild(__name__)
    downloaded = []
    prev_downloaded = set()
    if os.path.exists(state_file):
        with open(state_file, "r") as f:
            prev_downloaded = set([line.rstrip() for line in f])

    status = {'manifest': 0, 'filesystem': 0, 'downloaded': 0, 'error': 0}

    for url in urls:
        try:
            filename = filesystem.url2filename(url, ending=ending)
            if url in prev_downloaded:
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'url exists in download manifest'
                    })
                status['manifest'] += 1
                continue
            target_file = os.path.join(path, filename)
            if os.path.exists(target_file):
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'skip_download',
                        "url": url,
                        "file": filename,
                        "msg": 'file exists locally, updating manifest'
                    })
                # update the state_file with files that were found on disk
                downloaded.append(url)
                status['filesystem'] += 1
                continue
            r = requests.get(url)
            if r.status_code == 200:
                with open(target_file, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                downloaded.append(url)
                status['downloaded'] += 1
                log.debug(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename
                    })
            else:
                log.error(
                    chlogger, {
                        "src": resource_name,
                        "action": 'download',
                        "url": url,
                        "file": filename,
                        "status_code": r.status_code,
                        "ERROR": 'http_request_failed'
                    })
        except Exception as e:
            log.error(
                chlogger, {
                    "src": resource_name,
                    "action": 'download',
                    "url": url,
                    "ERROR": "http_request_failed",
                    "exception": str(e),
                    "traceback": str(tb=traceback.format_exc())
                })
            status['error'] += 1
        # TODO: this is such a hack
        time.sleep(delay)
        # ensure that all files in the download directory are read only
        for f in filesystem.glob_dir(path, ending):
            os.chmod(os.path.join(path, f), S_IREAD | S_IRGRP | S_IROTH)
        log.info(chlogger, {
                "src"                   : resource_name,
                "action"                : 'download',
                "url"                   : url,
                'skipped_in_manifest'   : status['manifest'],
                'skipped_in_filesystem' : status['filesystem'],
                'downloaded'            : status['downloaded'],
                'error'                 : status['error'],
                })
    return downloaded
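
# A minimal sketch of driving download directly; the urls, state file, and
# download directory are hypothetical stand-ins.
import logging
import os

logging.basicConfig(level=logging.DEBUG)
example_logger = logging.getLogger("download-example")
os.makedirs("downloads", exist_ok=True)
fetched = download(example_logger,
                   resource_name="demo-feed",
                   delay=1,
                   urls=["http://example.com/a.zip",
                         "http://example.com/b.zip"],
                   state_file="state.txt",
                   path="downloads")
# persist the newly fetched urls so the next run skips them
with open("state.txt", "a") as f:
    for url in fetched:
        f.write(url + "\n")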