示例#1
0
def main(cli_args=None):
    """Main function to implement command-line interface"""
    if cli_args is None:
        cli_args = sys.argv[1:]

    args = parse_args(cli_args)

    # Print defined parameters
    v = dict(vars(args))
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    v["version"] = pkg_resources.get_distribution("seaflowpy").version
    print "Defined parameters:"
    print json.dumps(v, indent=2)
    print ""

    # Find EVT files
    if args.evt_dir:
        files = evt.find_evt_files(args.evt_dir)
    elif args.s3:
        # Make sure configuration for s3 is ready to go
        config = conf.get_aws_config(s3_only=True)
        cloud = clouds.AWS(config.items("aws"))
        # Make sure try to access S3 up front to setup AWS credentials before
        # launching child processes.
        try:
            files = cloud.get_files(args.cruise)
            files = evt.parse_file_list(files)  # Only keep EVT files
        except botocore.exceptions.NoCredentialsError as e:
            print "Please configure aws first:"
            print "  $ conda install aws"
            print "  or"
            print "  $ pip install aws"
            print "  then"
            print "  $ aws configure"
            sys.exit(1)

    # Restrict length of file list with --limit
    if (not args.limit is None) and (args.limit > 0):
        files = files[:args.limit]

    filter_keys = ["notch1", "notch2", "width", "offset", "origin"]
    filter_options = dict((k, getattr(args, k)) for k in filter_keys)

    # Filter
    if args.twopass:
        filterer = filterevt.two_pass_filter
    else:
        filterer = filterevt.filter_evt_files
    filterer(files, args.cruise, filter_options, args.db,
             args.opp_dir, s3=args.s3, process_count=args.process_count,
             every=args.resolution)

    # Index
    if args.db:
        db.ensure_indexes(args.db)
示例#2
0
def filter_evt_files(files,
                     cruise,
                     filter_options,
                     dbpath,
                     opp_dir,
                     s3=False,
                     process_count=1,
                     every=10.0):
    """Filter a list of EVT files.

    Arguments arguments:
        files - paths to EVT files to filter
        cruise - cruise name
        filter_options - Dictionary of filter params
            (notch1, notch2, width, offset, origin)
        dbpath = SQLite3 db path
        opp_dir = Directory for output binary OPP files

    Keyword arguments:
        s3 - Get EVT data from S3
        process_count - number of worker processes to use
        every - Percent progress output resolution
    """
    o = {
        "file": None,  # fill in later
        "cruise": cruise,
        "process_count": process_count,
        "filter_options": filter_options,
        "every": every,
        "s3": s3,
        "cloud_config_items": None,
        "dbpath": dbpath,
        "opp_dir": opp_dir,
        "filter_id": None  # fill in later
    }

    if dbpath:
        dbdir = os.path.dirname(dbpath)
        if dbdir and not os.path.isdir(dbdir):
            util.mkdir_p(dbdir)
        db.ensure_tables(dbpath)
        o["filter_id"] = db.save_filter_params(dbpath, filter_options)

    if s3:
        config = conf.get_aws_config(s3_only=True)
        o["cloud_config_items"] = config.items("aws")

    if process_count > 1:
        # Create a pool of N worker processes
        pool = Pool(process_count)

        def mapper(worker, task_list):
            return pool.imap_unordered(worker, task_list)
    else:

        def mapper(worker, task_list):
            return imap(worker, task_list)

    evt_count = 0
    evt_signal_count = 0
    opp_count = 0
    files_ok = 0

    # Construct worker inputs
    inputs = []
    for f in files:
        inputs.append(copy.copy(o))
        inputs[-1]["file"] = f

    print ""
    print "Filtering %i EVT files. Progress every %i%% (approximately)" % \
        (len(files), every)

    t0 = time.time()

    last = 0  # Last progress milestone in increments of every
    evt_count_block = 0  # EVT particles in this block (between milestones)
    evt_signal_count_block = 0  # EVT noise filtered particles in this block
    opp_count_block = 0  # OPP particles in this block

    # Filter particles in parallel with process pool
    for i, res in enumerate(mapper(do_work, inputs)):
        evt_count_block += res["evt_count"]
        evt_signal_count_block += res["evt_signal_count"]
        opp_count_block += res["opp_count"]
        files_ok += 1 if res["ok"] else 0

        # Print progress periodically
        perc = float(i + 1) / len(files) * 100  # Percent completed
        # Round down to closest every%
        milestone = int(perc / every) * every
        if milestone > last:
            now = time.time()
            evt_count += evt_count_block
            evt_signal_count += evt_signal_count_block
            opp_count += opp_count_block
            ratio_signal_block = zerodiv(opp_count_block,
                                         evt_signal_count_block)
            ratio_block = zerodiv(opp_count_block, evt_count_block)
            msg = "File: %i/%i (%.02f%%)" % (i + 1, len(files), perc)
            msg += " Particles this block: %i / %i (%i) %.04f (%.04f) elapsed: %.2fs" % \
                (opp_count_block, evt_signal_count_block, evt_count_block,
                ratio_signal_block, ratio_block, now - t0)
            print msg
            sys.stdout.flush()
            last = milestone
            evt_count_block = 0
            evt_signal_count_block = 0
            opp_count_block = 0
    # If any particle count data is left, add it to totals
    if evt_count_block > 0:
        evt_count += evt_count_block
        evt_signal_count += evt_signal_count_block
        opp_count += opp_count_block

    opp_evt_signal_ratio = zerodiv(opp_count, evt_signal_count)
    opp_evt_ratio = zerodiv(opp_count, evt_count)

    t1 = time.time()
    delta = t1 - t0
    evtrate = zerodiv(evt_count, delta)
    evtsignalrate = zerodiv(evt_signal_count, delta)
    opprate = zerodiv(opp_count, delta)

    print ""
    print "Input EVT files = %i" % len(files)
    print "Parsed EVT files = %i" % files_ok
    print "EVT particles = %s (%.2f p/s)" % (evt_count, evtrate)
    print "EVT noise filtered particles = %s (%.2f p/s)" % (evt_signal_count,
                                                            evtsignalrate)
    print "OPP particles = %s (%.2f p/s)" % (opp_count, opprate)
    print "OPP/EVT ratio = %.04f (%.04f)" % (opp_evt_signal_ratio,
                                             opp_evt_ratio)
    print "Filtering completed in %.2f seconds" % (delta, )
def main(cli_args=None):
    """Main function to implement command-line interface.

    Lists files per cruise from cloud storage, starts (or dry-run
    simulates) cloud instances, assigns cruises to hosts weighted by
    file count, and runs remote filtering over SSH via fabric.
    Instances are cleaned up on exit unless --nocleanup was given.

    Keyword arguments:
        cli_args - list of command-line argument strings; defaults to
            sys.argv[1:] when None
    """
    if cli_args is None:
        cli_args = sys.argv[1:]

    args = parse_args(cli_args)

    print "Started at {}".format(datetime.datetime.utcnow().isoformat())

    # Print defined parameters
    v = dict(vars(args))
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    v["version"] = pkg_resources.get_distribution("seaflowpy").version
    print "Defined parameters:"
    print json.dumps(v, indent=2)
    print ""

    # Make sure configuration for aws and ssh is ready to go
    config = conf.get_aws_config()
    conf.get_ssh_config(config)
    cloud = clouds.AWS(config.items("aws"))

    # Configure fabric
    env.connection_attempts = 6
    # Tell fabric the SSH user name and key file location
    env.user = config.get("ssh", "user")
    env.key_filename = os.path.expanduser(config.get("ssh", "ssh-key-file"))

    try:
        print "Getting lists of files for each cruise"
        cruise_files = {}

        # Handle case where cruises are listed in a file
        if len(args.cruises) == 1 and os.path.isfile(args.cruises[0]):
            with open(args.cruises[0]) as fh:
                args.cruises = fh.read().split()
        try:
            # Per-cruise file counts drive the host assignment below
            for c in args.cruises:
                cruise_files[c] = cloud.get_files(c)
                print "{:<20} {}".format(c, len(cruise_files[c]))
            print ""
        except botocore.exceptions.NoCredentialsError as e:
            print "Please configure aws first:"
            print "  $ conda install aws"
            print "  or"
            print "  $ pip install aws"
            print "  then"
            print "  $ aws configure"
            sys.exit(1)

        if args.dryrun:
            # Create dummy host list
            print "Creating {} dummy hosts".format(args.instance_count)
            env.hosts = ["dummy{}".format(i) for i in range(args.instance_count)]
        else:
            print "Starting {} instances".format(args.instance_count)
            result = cloud.start(
                count=args.instance_count,
                instance_type=args.instance_type
            )
            for iid, iip in zip(result["InstanceIds"], result["publicips"]):
                print "  InstanceId = {}, IP = {}".format(iid, iip)
            env.hosts.extend(result["publicips"])
        print ""

        # Fairly divide cruises into hosts based on number of files
        print "Assigning cruises to {} hosts".format(len(env.hosts))
        host_assignments = assign_keys_to_hosts(env.hosts, cruise_files)
        for h in host_assignments:
            # Per-host total file count, then the per-cruise breakdown
            htotal = sum([c[1] for c in host_assignments[h]])
            print "{:<20} {}".format(h, htotal)
            for c in host_assignments[h]:
                print "  {:<18} {}".format(c[0], c[1])
        print ""

        if args.dryrun:
            # Stop before any remote work; no instances were started
            print "Dry run complete"
            print ""
            return


        print "Waiting for hosts to come up with SSH"
        execute(wait_for_up)

        print "Transfer AWS credentials"
        with hide("output"):
            execute(rsync_put, "~/.aws/", ".aws")

        print "Transfer seaflowpy configuration"
        with hide("output"):
            execute(rsync_put, "~/.seaflowpy/", ".seaflowpy")

        print "Install seaflowpy"
        execute(pull_seaflowpy)

        # Host list in env.hosts should be populated now and all machines up
        print "Filter data"
        execute(filter_cruise, host_assignments, args.output_dir, args.process_count)
    finally:
        disconnect_all()  # always disconnect SSH connections
        if not args.nocleanup:
            cloud.cleanup()  # clean up in case of any unhandled exceptions
        print "Finished at {}".format(datetime.datetime.utcnow().isoformat())
示例#4
0
def filter_evt_files(files, cruise, filter_options, dbpath, opp_dir, s3=False,
                     process_count=1, every=10.0):
    """Filter a list of EVT files.

    Arguments arguments:
        files - paths to EVT files to filter
        cruise - cruise name
        filter_options - Dictionary of filter params
            (notch1, notch2, width, offset, origin)
        dbpath = SQLite3 db path
        opp_dir = Directory for output binary OPP files

    Keyword arguments:
        s3 - Get EVT data from S3
        process_count - number of worker processes to use
        every - Percent progress output resolution
    """
    o = {
        "file": None,  # fill in later
        "cruise": cruise,
        "process_count": process_count,
        "filter_options": filter_options,
        "every": every,
        "s3": s3,
        "cloud_config_items": None,
        "dbpath": dbpath,
        "opp_dir": opp_dir,
        "filter_id": None  # fill in later
    }

    if dbpath:
        dbdir = os.path.dirname(dbpath)
        if dbdir and not os.path.isdir(dbdir):
            util.mkdir_p(dbdir)
        db.ensure_tables(dbpath)
        o["filter_id"] = db.save_filter_params(dbpath, filter_options)

    if s3:
        config = conf.get_aws_config(s3_only=True)
        o["cloud_config_items"] = config.items("aws")

    if process_count > 1:
        # Create a pool of N worker processes
        pool = Pool(process_count)
        def mapper(worker, task_list):
            return pool.imap_unordered(worker, task_list)
    else:
        def mapper(worker, task_list):
            return imap(worker, task_list)

    evt_count = 0
    evt_signal_count = 0
    opp_count = 0
    files_ok = 0

    # Construct worker inputs
    inputs = []
    for f in files:
        inputs.append(copy.copy(o))
        inputs[-1]["file"] = f

    print ""
    print "Filtering %i EVT files. Progress every %i%% (approximately)" % \
        (len(files), every)

    t0 = time.time()

    last = 0  # Last progress milestone in increments of every
    evt_count_block = 0  # EVT particles in this block (between milestones)
    evt_signal_count_block = 0  # EVT noise filtered particles in this block
    opp_count_block = 0  # OPP particles in this block

    # Filter particles in parallel with process pool
    for i, res in enumerate(mapper(do_work, inputs)):
        evt_count_block += res["evt_count"]
        evt_signal_count_block += res["evt_signal_count"]
        opp_count_block += res["opp_count"]
        files_ok += 1 if res["ok"] else 0

        # Print progress periodically
        perc = float(i + 1) / len(files) * 100  # Percent completed
        # Round down to closest every%
        milestone = int(perc / every) * every
        if milestone > last:
            now = time.time()
            evt_count += evt_count_block
            evt_signal_count += evt_signal_count_block
            opp_count += opp_count_block
            ratio_signal_block = zerodiv(opp_count_block, evt_signal_count_block)
            ratio_block = zerodiv(opp_count_block, evt_count_block)
            msg = "File: %i/%i (%.02f%%)" % (i + 1, len(files), perc)
            msg += " Particles this block: %i / %i (%i) %.04f (%.04f) elapsed: %.2fs" % \
                (opp_count_block, evt_signal_count_block, evt_count_block,
                ratio_signal_block, ratio_block, now - t0)
            print msg
            sys.stdout.flush()
            last = milestone
            evt_count_block = 0
            evt_signal_count_block = 0
            opp_count_block = 0
    # If any particle count data is left, add it to totals
    if evt_count_block > 0:
        evt_count += evt_count_block
        evt_signal_count += evt_signal_count_block
        opp_count += opp_count_block

    opp_evt_signal_ratio = zerodiv(opp_count, evt_signal_count)
    opp_evt_ratio = zerodiv(opp_count, evt_count)

    t1 = time.time()
    delta = t1 - t0
    evtrate = zerodiv(evt_count, delta)
    evtsignalrate = zerodiv(evt_signal_count, delta)
    opprate = zerodiv(opp_count, delta)

    print ""
    print "Input EVT files = %i" % len(files)
    print "Parsed EVT files = %i" % files_ok
    print "EVT particles = %s (%.2f p/s)" % (evt_count, evtrate)
    print "EVT noise filtered particles = %s (%.2f p/s)" % (evt_signal_count, evtsignalrate)
    print "OPP particles = %s (%.2f p/s)" % (opp_count, opprate)
    print "OPP/EVT ratio = %.04f (%.04f)" % (opp_evt_signal_ratio, opp_evt_ratio)
    print "Filtering completed in %.2f seconds" % (delta,)