Example #1
def mount(name, path, user, address, hostname):
    """Interactive routine for mounting a storage node located at ROOT."""

    import socket

    # We need to write to the database.
    di.connect_database(read_write=True)

    try:
        node = di.StorageNode.get(name=name)
    except pw.DoesNotExist:
        print('Storage node "%s" does not exist. I quit.' % name)
        return

    if node.mounted:
        print('Node "%s" is already mounted.' % name)
        return

    # Set the default hostname if required
    if hostname is None:
        hostname = socket.gethostname()
        print('I will set the host to "%s".' % hostname)

    # Set the parameters of this node
    node.username = user
    node.address = address
    node.mounted = True
    node.host = hostname

    if path is not None:
        node.root = path

    node.save()

    print('Successfully mounted "%s".' % name)
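
# A minimal sketch (not from the source) of how mount() could be exposed on the
# command line, assuming the same click CLI pattern shown in Example #12. The
# option names and defaults are illustrative assumptions only.
import click


@click.command()
@click.argument("name")
@click.option("--path", default=None, help="Override the node root.")
@click.option("--user", default=None, help="Username for remote access.")
@click.option("--address", default=None, help="Address for remote access.")
@click.option("--hostname", default=None, help="Host the node is mounted on.")
def mount_cmd(name, path, user, address, hostname):
    """Hypothetical CLI wrapper that delegates to mount() above."""
    mount(name, path, user, address, hostname)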
Example #2
def unmount(root_or_name):
    """Unmount a storage node with location or named ROOT_OR_NAME."""
    import os
    import socket

    # We need to write to the database.
    di.connect_database(read_write=True)

    try:
        node = di.StorageNode.get(name=root_or_name)
    except pw.DoesNotExist:
        if root_or_name[-1] == "/":
            root_or_name = root_or_name[:len(root_or_name) - 1]

        if not os.path.exists(root_or_name):
            print("That is neither a node name, nor a path on this host. "
                  "I quit.")
            exit()
        try:
            node = di.StorageNode.get(root=root_or_name,
                                      host=socket.gethostname())
        except pw.DoesNotExist:
            print("That is neither a node name nor a root name that is "
                  "known. I quit.")
            exit()

    if not node.mounted:
        print("There is no node mounted there any more.")
    else:
        node.mounted = False
        node.save()
        print("Node successfully unmounted.")
Example #3
def import_file(node, root, acq_name, file_name):
    """Import a file into the DB, retrying if the database connection drops."""
    done = False
    while not done:
        try:
            _import_file(node, root, acq_name, file_name)
            done = True
        except pw.OperationalError:
            log.error("MySQL connexion dropped. Will attempt to reconnect in "
                      "five seconds.")
            time.sleep(5)
            di.connect_database(True)
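
# The retry loop above is a reusable pattern: keep retrying a database write,
# reconnecting whenever the connection drops. Below is a minimal, generic
# sketch of the same idea factored into a helper; it is an illustration, not
# code from the source, and it assumes the same module-level `log` and `di`
# objects used above.
import time

import peewee as pw


def retry_on_operational_error(func, *args, retry_delay=5, **kwargs):
    """Call func until it succeeds, reconnecting after dropped connections."""
    while True:
        try:
            return func(*args, **kwargs)
        except pw.OperationalError:
            log.error("MySQL connection dropped. Will attempt to reconnect in "
                      "%i seconds." % retry_delay)
            time.sleep(retry_delay)
            di.connect_database(True)


# Usage equivalent to import_file() above:
#     retry_on_operational_error(_import_file, node, root, acq_name, file_name)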
Example #4
def import_files(node_name, verbose, acq, dry):
    """Scan the current directory for known acquisition files and add them into the database for NODE.

    This command is useful for manually maintaining an archive where we can run
    alpenhornd in the usual manner.
    """
    import glob
    import os

    import peewee as pw
    from ch_util import data_index as di

    di.connect_database(read_write=True)

    # Construct list of acqs to scan
    if acq is None:
        acqs = glob.glob("*")
    else:
        acqs = acq

    # Keep track of state as we process the files
    added_files = []  # Files we have added to the database
    corrupt_files = []  # Known files which are corrupt
    registered_files = []  # Files already registered in the database
    unknown_files = []  # Files not known in the database
    not_acqs = []  # Directories which were not known acquisitions

    # Fetch a reference to the node
    try:
        node = di.StorageNode.select().where(
            di.StorageNode.name == node_name).get()
    except pw.DoesNotExist:
        print("Unknown node.")
        return

    with click.progressbar(acqs, label="Scanning acquisitions") as acq_iter:

        for acq_name in acq_iter:

            try:
                di.parse_acq_name(acq_name)
            except di.Validation:
                not_acqs.append(acq_name)
                continue

            try:
                acq = di.ArchiveAcq.select().where(
                    di.ArchiveAcq.name == acq_name).get()
            except pw.DoesNotExist:
                not_acqs.append(acq_name)
                continue

            files = glob.glob(acq_name + "/*")

            # Fetch lists of all files in this acquisition, and all
            # files in this acq with local copies
            file_names = [f.name for f in acq.files]
            local_file_names = [
                f.name for f in acq.files.join(di.ArchiveFileCopy).where(
                    di.ArchiveFileCopy.node == node)
            ]

            for fn in files:

                f_name = os.path.split(fn)[1]

                # Check if file exists in database
                if f_name not in file_names:
                    unknown_files.append(fn)
                    continue

                # Check if file is already registered on this node
                if f_name in local_file_names:
                    registered_files.append(fn)
                else:
                    archive_file = (di.ArchiveFile.select().where(
                        di.ArchiveFile.name == f_name,
                        di.ArchiveFile.acq == acq).get())

                    if os.path.getsize(fn) != archive_file.size_b:
                        corrupt_files.append(fn)
                        continue

                    added_files.append(fn)
                    if not dry:
                        di.ArchiveFileCopy.create(file=archive_file,
                                                  node=node,
                                                  has_file="Y",
                                                  wants_file="Y")

    print("\n==== Summary ====")
    print()
    print("Added %i files" % len(added_files))
    print()
    print("%i corrupt files." % len(corrupt_files))
    print("%i files already registered." % len(registered_files))
    print("%i files not known" % len(unknown_files))
    print("%i directories were not acquisitions." % len(not_acqs))

    if verbose > 0:
        print()
        print("Added files:")
        print()

        for fn in added_files:
            print(fn)

    if verbose > 1:

        print("Corrupt:")
        for fn in corrupt_files:
            print(fn)
        print()

        print("Unknown files:")
        for fn in unknown_files:
            print(fn)
        print()

        print("Unknown acquisitions:")
        for fn in not_acqs:
            print(fn)
        print()
Example #5
def format_transport(serial_num):
    """Interactive routine for formatting a transport disc as a storage
    node; formats and labels the disc as necessary, the adds to the
    database. The disk is specified using the manufacturers
    SERIAL_NUM, which is printed on the disk.
    """
    import glob
    import os

    if os.getuid() != 0:
        print("You must be root to run mount on a transport disc. I quit.")
        return

    # Find the disc.
    dev = glob.glob("/dev/disk/by-id/*%s" % serial_num)
    if len(dev) == 0:
        print("No disc with that serial number is attached.")
        return
    elif len(dev) > 1:
        print(
            "Confused: found more than one device matching that serial number:"
        )
        for d in dev:
            print("  %s" % d)
        print("Aborting.")
        return
    dev = dev[0]
    dev_part = "%s-part1" % dev

    # Figure out if it is formatted.
    print("Checking to see if disc is formatted. Please wait.")
    fp = os.popen("parted -s %s print" % dev)
    formatted = False
    part_start = False
    while True:
        line = fp.readline()
        if not line:
            break
        if (line.find("Number") == 0 and line.find("Start") > 0
                and line.find("File system") > 0):
            part_start = True
        elif line.strip() != "" and part_start:
            formatted = True
    fp.close()

    if not formatted:
        if not click.confirm("Disc is not formatted. Should I format it?"):
            return
        print("Creating partition. Please wait.")
        os.system(
            "parted -s -a optimal %s mklabel gpt -- mkpart primary 0%% 100%%" %
            dev)
        print("Formatting disc. Please wait.")
        os.system("mkfs.ext4 %s -m 0 -L CH-%s" % (dev_part, serial_num))
    else:
        print("Disc is already formatted.")

    e2label = get_e2label(dev_part)
    name = "CH-%s" % serial_num
    if e2label and e2label != name:
        print('Disc label "%s" does not conform to labelling standard, '
              "which is CH-<serialnum>." % e2label)
        exit()
    elif not e2label:
        print('Labelling the disc as "%s" (using e2label) ...' % (name))
        assert dev_part is not None
        assert len(name) <= MAX_E2LABEL_LEN
        stat = os.system("/sbin/e2label %s %s" % (dev_part, name))
        if stat:
            print("Failed to e2label! Stat = %s. I quit." % (stat))
            exit()

    # Ensure the mount path exists.
    root = "/mnt/%s" % name
    if not os.path.isdir(root):
        print("Creating mount point %s." % root)
        os.mkdir(root)

    # Check to see if the disc is mounted.
    fp = os.popen("df")
    mounted = False
    dev_part_abs = os.path.realpath(dev_part)
    while True:
        line = fp.readline()
        if not line:
            break
        if line.find(root) > 0:
            if (line[:len(dev_part)] == dev_part
                    or line[:len(dev_part_abs)] == dev_part_abs):
                mounted = True
            else:
                print("%s is a mount point, but %s is already mounted there." %
                      (root, line.split()[0]))
    fp.close()

    try:
        node = di.StorageNode.get(name=name)
    except pw.DoesNotExist:
        print("This disc has not been registered yet as a storage node. "
              "Registering now.")
        try:
            group = di.StorageGroup.get(name="transport")
        except pw.DoesNotExist:
            print('Hmmm. Storage group "transport" does not exist. I quit.')
            exit()

        # We need to write to the database.
        di.connect_database(read_write=True)
        node = di.StorageNode.create(name=name,
                                     root=root,
                                     group=group,
                                     storage_type="T",
                                     min_avail_gb=1)

        print("Successfully created storage node.")

    print(
        "Node created but not mounted. Run alpenhorn mount_transport for that."
    )
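
# Parsing `parted` output via os.popen(), as above, is fragile. A possible
# alternative sketch (an assumption, not how the source does it) asks blkid
# whether the partition already carries a filesystem signature:
import subprocess


def partition_is_formatted(dev_part):
    """Return True if blkid reports a filesystem type on dev_part."""
    # blkid exits non-zero when it finds no recognisable signature.
    result = subprocess.run(["blkid", "-o", "value", "-s", "TYPE", dev_part],
                            capture_output=True, text=True)
    return result.returncode == 0 and result.stdout.strip() != ""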
Example #6
def sync(node_name, group_name, acq, force, nice, target, transport, show_acq,
         show_files):
    """Copy all files from NODE to GROUP that are not already present.

    We can also use the --target option to only transfer files that are not
    already available in either the destination GROUP or the TARGET group. This
    is useful for transferring data to a staging location before going to a
    final archive (e.g. HPSS, transport disks).
    """

    # Make sure we connect RW
    di.connect_database(read_write=True)

    try:
        from_node = di.StorageNode.get(name=node_name)
    except pw.DoesNotExist:
        raise Exception('Node "%s" does not exist in the DB.' % node_name)
    try:
        to_group = di.StorageGroup.get(name=group_name)
    except pw.DoesNotExist:
        raise Exception('Group "%s" does not exist in the DB.' % group_name)

    # Construct list of file copies that are available on the source node, and
    # not available on any nodes at the destination. This query is quite complex
    # so I've broken it up...

    # First get the nodes at the destination...
    nodes_at_dest = di.StorageNode.select().where(
        di.StorageNode.group == to_group)

    # Then use this to get a list of all files at the destination...
    files_at_dest = (di.ArchiveFile.select().join(di.ArchiveFileCopy).where(
        di.ArchiveFileCopy.node << nodes_at_dest,
        di.ArchiveFileCopy.has_file == "Y"))

    # Then combine to get all file(copies) that are available at the source but
    # not at the destination...
    copy = di.ArchiveFileCopy.select().where(
        di.ArchiveFileCopy.node == from_node,
        di.ArchiveFileCopy.has_file == "Y",
        ~(di.ArchiveFileCopy.file << files_at_dest),
    )

    # If the target option has been specified, only copy files that are also
    # not available there...
    if target is not None:

        # Fetch a reference to the target group
        try:
            target_group = di.StorageGroup.get(name=target)
        except pw.DoesNotExist:
            raise RuntimeError('Target group "%s" does not exist in the DB.' %
                               target)

        # First get the nodes at the destination...
        nodes_at_target = di.StorageNode.select().where(
            di.StorageNode.group == target_group)

        # Then use this to get a list of all files at the destination...
        files_at_target = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << nodes_at_target,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        # Only match files that are also not available at the target
        copy = copy.where(~(di.ArchiveFileCopy.file << files_at_target))

    # In transport mode (DEPRECATED) we only move files that don't have an
    # archive copy elsewhere...
    if transport:
        import warnings

        warnings.warn(
            "Transport mode is deprecated. Try to use --target instead.")

        # Get list of other archive nodes
        other_archive_nodes = di.StorageNode.select().where(
            di.StorageNode.storage_type == "A", di.StorageNode.id != from_node)

        files_in_archive = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << other_archive_nodes,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        copy = copy.where(~(di.ArchiveFileCopy.file << files_in_archive))

    # Join onto ArchiveFile for later query parts
    copy = copy.join(di.ArchiveFile)

    # If requested, limit query to a specific acquisition...
    if acq is not None:

        # Fetch acq if specified
        try:
            acq = di.ArchiveAcq.get(name=acq)
        except pw.DoesNotExist:
            raise Exception('Acquisition "%s" does not exist in the DB.' % acq)

        # Restrict files to be in the acquisition
        copy = copy.where(di.ArchiveFile.acq == acq)

    if not copy.count():
        print("No files to copy from node %s." % (node_name))
        return

    # Show acquisitions based summary of files to be copied
    if show_acq:
        acqs = [c.file.acq.name for c in copy]

        import collections

        for acq, count in collections.Counter(acqs).items():
            print("%s [%i files]" % (acq, count))

    # Show all files to be copied
    if show_files:
        for c in copy:
            print("%s/%s" % (c.file.acq.name, c.file.name))

    size_bytes = copy.aggregate(pw.fn.Sum(di.ArchiveFile.size_b))
    size_gb = int(size_bytes) / 2**30.0

    print(
        "Will request that %d files (%.1f GB) be copied from node %s to group %s."
        % (copy.count(), size_gb, node_name, group_name))

    if not (force or click.confirm("Do you want to proceed?")):
        print("Aborted.")
        return

    dtnow = datetime.datetime.now()

    # Perform update in a transaction to avoid any clobbering from concurrent updates
    with di.ArchiveFileCopyRequest._meta.database.atomic():

        # Get a list of all the file ids for the copies we should perform
        files_ids = [c.file_id for c in copy]

        # Get a list of all the file ids for existing requests
        requests = di.ArchiveFileCopyRequest.select().where(
            di.ArchiveFileCopyRequest.group_to == to_group,
            di.ArchiveFileCopyRequest.node_from == from_node,
        )
        req_file_ids = [req.file_id for req in requests]

        # Separate the files into ones that already have requests and ones that don't
        files_in = [x for x in files_ids if x in req_file_ids]
        files_out = [x for x in files_ids if x not in req_file_ids]

        sys.stdout.write(
            "Updating %i existing requests and inserting %i new ones.\n" %
            (len(files_in), len(files_out)))

        # Perform an update of all the existing copy requests
        if len(files_in) > 0:
            update = di.ArchiveFileCopyRequest.update(
                nice=nice,
                completed=False,
                cancelled=False,
                timestamp=dtnow,
                n_requests=di.ArchiveFileCopyRequest.n_requests + 1,
            )

            update = update.where(
                di.ArchiveFileCopyRequest.file << files_in,
                di.ArchiveFileCopyRequest.group_to == to_group,
                di.ArchiveFileCopyRequest.node_from == from_node,
            )
            update.execute()

        # Insert any new requests
        if len(files_out) > 0:

            # Construct a list of all the rows to insert
            insert = [{
                "file": fid,
                "node_from": from_node,
                "nice": 0,
                "group_to": to_group,
                "completed": False,
                "n_requests": 1,
                "timestamp": dtnow,
            } for fid in files_out]

            # Do a bulk insert of these new rows
            di.ArchiveFileCopyRequest.insert_many(insert).execute()
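
# The core of sync() is the query pattern "copies on the source node whose file
# has no good copy at the destination". A stripped-down sketch of just that
# pattern, reusing the same peewee 2 style `<<` (IN) operator as above; the
# helper name is illustrative, not from the source.
def copies_missing_from_group(from_node, to_group):
    """Return ArchiveFileCopy rows on from_node with no "Y" copy in to_group."""
    nodes_at_dest = di.StorageNode.select().where(
        di.StorageNode.group == to_group)
    files_at_dest = (di.ArchiveFile.select().join(di.ArchiveFileCopy).where(
        di.ArchiveFileCopy.node << nodes_at_dest,
        di.ArchiveFileCopy.has_file == "Y"))
    return di.ArchiveFileCopy.select().where(
        di.ArchiveFileCopy.node == from_node,
        di.ArchiveFileCopy.has_file == "Y",
        ~(di.ArchiveFileCopy.file << files_at_dest),
    )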
Example #7
def clean(node_name, days, force, now, target, acq):
    """Clean up NODE by marking older files as potentially removable.

    If --target is specified we will only remove files already available in the
    TARGET_GROUP. This is useful for cleaning out intermediate locations such as
    transport disks.

    Using the --days flag will only clean correlator and housekeeping
    files which have a timestamp associated with them. It will not
    touch other types. If no --days flag is given, all files will be
    considered for removal.
    """

    import peewee as pw

    di.connect_database(read_write=True)

    try:
        this_node = di.StorageNode.get(di.StorageNode.name == node_name)
    except pw.DoesNotExist:
        print("Specified node does not exist.")
        return

    # Check to see if we are on an archive node
    if this_node.storage_type == "A":
        if force or click.confirm("DANGER: run clean on archive node?"):
            print("%s is an archive node. Forcing clean." % node_name)
        else:
            print("Cannot clean archive node %s without forcing." % node_name)
            return

    # Select FileCopys on this node.
    files = di.ArchiveFileCopy.select(di.ArchiveFileCopy.id).where(
        di.ArchiveFileCopy.node == this_node,
        di.ArchiveFileCopy.wants_file == "Y")

    # Limit to acquisition
    if acq is not None:
        try:
            acq = di.ArchiveAcq.get(name=acq)
        except pw.DoesNotExist:
            raise RuntimeError("Specified acquisition %s does not exist" % acq)

        files_in_acq = di.ArchiveFile.select().where(di.ArchiveFile.acq == acq)

        files = files.where(di.ArchiveFileCopy.file << files_in_acq)

    # If the target option has been specified, only clean files also available there...
    if target is not None:

        # Fetch a reference to the target group
        try:
            target_group = di.StorageGroup.get(name=target)
        except pw.DoesNotExist:
            raise RuntimeError('Target group "%s" does not exist in the DB.' %
                               target)

        # First get the nodes at the destination...
        nodes_at_target = di.StorageNode.select().where(
            di.StorageNode.group == target_group)

        # Then use this to get a list of all files at the destination...
        files_at_target = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << nodes_at_target,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        # Only match files that are also available at the target
        files = files.where(di.ArchiveFileCopy.file << files_at_target)

    # If --days has been set we need to restrict to files older than the given
    # time. This only works for a few particular file types
    if days is not None and days > 0:

        # Get the time for the oldest files to keep
        oldest = datetime.datetime.now() - datetime.timedelta(days)
        oldest_unix = ephemeris.ensure_unix(oldest)

        # List of filetypes we want to update, needs a human readable name and a
        # FileInfo table.
        filetypes = [["correlation", di.CorrFileInfo],
                     ["housekeeping", di.HKFileInfo]]

        file_ids = []

        # Iterate over file types for cleaning
        for name, infotable in filetypes:

            # Filter to fetch only ones with a start time older than `oldest`
            oldfiles = (files.join(di.ArchiveFile).join(infotable).where(
                infotable.start_time < oldest_unix))

            local_file_ids = list(oldfiles)

            # Get number of correlation files
            count = oldfiles.count()

            if count > 0:
                size_bytes = (di.ArchiveFileCopy.select().where(
                    di.ArchiveFileCopy.id << local_file_ids).join(
                        di.ArchiveFile).aggregate(
                            pw.fn.Sum(di.ArchiveFile.size_b)))

                size_gb = int(size_bytes) / 2**30.0

                print("Cleaning up %i %s files (%.1f GB) from %s " %
                      (count, name, size_gb, node_name))

                file_ids += local_file_ids

    # If days is not set, then just select all files that meet the requirements so far
    else:

        file_ids = list(files)
        count = files.count()

        if count > 0:
            size_bytes = (di.ArchiveFileCopy.select().where(
                di.ArchiveFileCopy.id << file_ids).join(
                    di.ArchiveFile).aggregate(pw.fn.Sum(
                        di.ArchiveFile.size_b)))

            size_gb = int(size_bytes) / 2**30.0

            print("Cleaning up %i files (%.1f GB) from %s " %
                  (count, size_gb, node_name))

    # If there are any files to clean, ask for confirmation and the mark them in
    # the database for removal
    if len(file_ids) > 0:
        if force or click.confirm("  Are you sure?"):
            print("  Marking files for cleaning.")

            state = "N" if now else "M"

            update = di.ArchiveFileCopy.update(
                wants_file=state).where(di.ArchiveFileCopy.id << file_ids)

            n = update.execute()

            print("Marked %i files for cleaning" % n)

        else:
            print("  Cancelled")
    else:
        print("No files selected for cleaning on %s." % node_name)
Example #8
def verify(node_name, md5, fixdb, acq):
    """Verify the archive on NODE against the database."""

    import os

    try:
        this_node = di.StorageNode.get(di.StorageNode.name == node_name)
    except pw.DoesNotExist:
        print("Specified node does not exist.")
        return

    ## Use a complicated query with a tuples construct to fetch everything we
    ## need in a single query. This massively speeds up the whole process versus
    ## fetching all the FileCopy's then querying for Files and Acqs.
    lfiles = (di.ArchiveFile.select(
        di.ArchiveFile.name,
        di.ArchiveAcq.name,
        di.ArchiveFile.size_b,
        di.ArchiveFile.md5sum,
        di.ArchiveFileCopy.id,
    ).join(di.ArchiveAcq).switch(di.ArchiveFile).join(
        di.ArchiveFileCopy).where(di.ArchiveFileCopy.node == this_node,
                                  di.ArchiveFileCopy.has_file == "Y").tuples())

    missing_files = []
    corrupt_files = []

    missing_ids = []
    corrupt_ids = []

    nfiles = 0

    with click.progressbar(lfiles, label="Scanning files") as lfiles_iter:
        for filename, acqname, filesize, md5sum, fc_id in lfiles_iter:

            # Skip if not in specified acquisitions
            if len(acq) > 0 and acqname not in acq:
                continue

            nfiles += 1

            filepath = this_node.root + "/" + acqname + "/" + filename

            # Check if file is plain missing
            if not os.path.exists(filepath):
                missing_files.append(filepath)
                missing_ids.append(fc_id)
                continue

            if md5:
                file_md5 = di.md5sum_file(filepath)
                corrupt = file_md5 != md5sum
            else:
                corrupt = os.path.getsize(filepath) != filesize

            if corrupt:
                corrupt_files.append(filepath)
                corrupt_ids.append(fc_id)
                continue

    if len(missing_files) > 0:
        print()
        print("=== Missing files ===")
        for fname in missing_files:
            print(fname)

    if len(corrupt_files) > 0:
        print()
        print("=== Corrupt files ===")
        for fname in corrupt_files:
            print(fname)

    print()
    print("=== Summary ===")
    print("  %i total files" % nfiles)
    print("  %i missing files" % len(missing_files))
    print("  %i corrupt files" % len(corrupt_files))
    print()

    # Fix up the database by marking files as missing, and marking
    # corrupt files for verification by alpenhornd.
    if fixdb:

        # Make sure we connect RW
        di.connect_database(read_write=True)

        if (len(missing_files) > 0) and click.confirm("Fix missing files"):
            missing_count = (di.ArchiveFileCopy.update(has_file="N").where(
                di.ArchiveFileCopy.id << missing_ids).execute())
            print("  %i marked as missing" % missing_count)

        if (len(corrupt_files) > 0) and click.confirm("Fix corrupt files"):
            corrupt_count = (di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.id << corrupt_ids).execute())
            print("  %i corrupt files marked for verification" % corrupt_count)
Example #9
def update_node_requests(node):
    """Process file copy requests onto this node."""

    global done_transport_this_cycle

    # Ensure we are not on an HPSS node
    if is_hpss_node(node):
        log.error("Cannot process HPSS node here.")
        return

    avail_gb = node.avail_gb

    # Skip if node is too full
    if avail_gb < (node.min_avail_gb + 10):
        log.info("Node %s is nearly full. Skip transfers." % node.name)
        return

    # Calculate the total archive size from the database
    size_query = (di.ArchiveFile.select(fn.Sum(di.ArchiveFile.size_b)).join(
        di.ArchiveFileCopy).where(di.ArchiveFileCopy.node == node,
                                  di.ArchiveFileCopy.has_file == "Y"))
    size = size_query.scalar(as_tuple=True)[0]
    current_size_gb = float(0.0 if size is None else size) / 2**30.0

    # Stop if the current archive size is bigger than the maximum (if set, i.e. > 0)
    if current_size_gb > node.max_total_gb and node.max_total_gb > 0.0:
        log.info(
            "Node %s has reached maximum size (current: %.1f GB, limit: %.1f GB)"
            % (node.name, current_size_gb, node.max_total_gb))
        return

    # ... OR if this is a transport node quit if the transport cycle is done.
    if node.storage_type == "T" and done_transport_this_cycle:
        log.info("Ignoring transport node %s" % node.name)
        return

    start_time = time.time()

    # Fetch requests to process from the database
    requests = di.ArchiveFileCopyRequest.select().where(
        ~di.ArchiveFileCopyRequest.completed,
        ~di.ArchiveFileCopyRequest.cancelled,
        di.ArchiveFileCopyRequest.group_to == node.group,
    )

    # Add in constraint that node_from cannot be an HPSS node
    requests = requests.join(
        di.StorageNode).where(di.StorageNode.address != "HPSS")

    for req in requests:

        # Only continue if the node is actually mounted
        if not req.node_from.mounted:
            continue

        # For transport disks we should only copy onto the transport
        # node if the from_node is local, this should prevent pointlessly
        # rsyncing across the network
        if node.storage_type == "T" and node.host != req.node_from.host:
            log.debug(
                "Skipping request for %s/%s from remote node [%s] onto local "
                "transport disks" %
                (req.file.acq.name, req.file.name, req.node_from.name))
            continue

        # Only proceed if the source file actually exists (and is not corrupted).
        try:
            di.ArchiveFileCopy.get(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
                di.ArchiveFileCopy.has_file == "Y",
            )
        except pw.DoesNotExist:
            log.error(
                "Skipping request for %s/%s since it is not available on "
                'node "%s". [file_id=%i]' % (req.file.acq.name, req.file.name,
                                             req.node_from.name, req.file.id))
            continue

        # Only proceed if the destination file does not already exist.
        try:
            di.ArchiveFileCopy.get(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == node,
                di.ArchiveFileCopy.has_file == "Y",
            )
            log.info("Skipping request for %s/%s since it already exists on "
                     'this node ("%s"), and updating DB to reflect this.' %
                     (req.file.acq.name, req.file.name, node.name))
            di.ArchiveFileCopyRequest.update(completed=True).where(
                di.ArchiveFileCopyRequest.file == req.file).where(
                    di.ArchiveFileCopyRequest.group_to ==
                    node.group).execute()
            continue
        except pw.DoesNotExist:
            pass

        # Check that there is enough space available.
        if avail_gb * 2**30.0 < 2.0 * req.file.size_b:
            log.warning('Node "%s" is full: not adding datafile "%s/%s".' %
                        (node.name, req.file.acq.name, req.file.name))
            continue

        # Construct the origin and destination paths.
        from_path = "%s/%s/%s" % (req.node_from.root, req.file.acq.name,
                                  req.file.name)
        if req.node_from.host != node.host:
            from_path = "%s@%s:%s" % (
                req.node_from.username,
                req.node_from.address,
                from_path,
            )

        to_path = "%s/%s/" % (node.root, req.file.acq.name)
        if not os.path.isdir(to_path):
            log.info('Creating directory "%s".' % to_path)
            os.mkdir(to_path)

        # Giddy up!
        log.info('Transferring file "%s/%s".' %
                 (req.file.acq.name, req.file.name))
        st = time.time()

        # Attempt to transfer the file. Each of the methods below needs to set a
        # return code `ret` and give an `md5sum` of the transferred file.

        # First we need to check if we are copying over the network
        if req.node_from.host != node.host:

            # First try bbcp which is a fast multistream transfer tool. bbcp can
            # calculate the md5 hash as it goes, so we'll do that to save doing
            # it at the end.
            if command_available("bbcp"):
                cmd = "bbcp -f -z --port 4200 -W 4M -s 16 -o -E md5= %s %s" % (
                    from_path,
                    to_path,
                )
                ret, stdout, stderr = run_command(cmd.split())

                # Attempt to parse STDERR for the md5 hash
                if ret == 0:
                    mo = re.search("md5 ([a-f0-9]{32})", stderr)
                    if mo is None:
                        log.error(
                            "BBCP transfer has gone awry. STDOUT: %s\n STDERR: %s"
                            % (stdout, stderr))
                        ret = -1
                        md5sum = None
                    else:
                        md5sum = mo.group(1)
                else:
                    md5sum = None

            # Next try rsync over ssh. We need to explicitly calculate the md5
            # hash after the fact
            elif command_available("rsync"):
                cmd = (
                    'rsync -z%s --rsync-path="ionice -c4 -n4 rsync" -e "ssh -q" %s %s'
                    % (RSYNC_FLAG, from_path, to_path))
                ret, stdout, stderr = run_command(cmd.split())

                md5sum = (di.md5sum_file("%s/%s" % (to_path, req.file.name))
                          if ret == 0 else None)

            # If we get here then we have no idea how to transfer the file...
            else:
                log.warn("No commands available to complete this transfer.")
                ret = -1

        # Okay, great we're just doing a local transfer.
        else:

            # First try to just hard link the file. This will only work if we
            # are on the same filesystem. As there's no actual copying it's
            # probably unnecessary to calculate the md5 checksum, so we'll just
            # fake it.
            try:
                link_path = "%s/%s/%s" % (node.root, req.file.acq.name,
                                          req.file.name)

                # Check explicitly if link already exists as this and
                # being unable to link will both raise OSError and get
                # confused.
                if os.path.exists(link_path):
                    log.error("File %s already exists. Clean up manually." %
                              link_path)
                    ret = -1
                else:
                    os.link(from_path, link_path)
                    ret = 0
                    md5sum = (
                        req.file.md5sum
                    )  # As we're linking the md5sum can't change. Skip the check here...

            # If we couldn't just link the file, try copying it with rsync.
            except OSError:
                if command_available("rsync"):
                    cmd = "rsync -%s %s %s" % (RSYNC_FLAG, from_path, to_path)
                    ret, stdout, stderr = run_command(cmd.split())

                    md5sum = (di.md5sum_file("%s/%s" %
                                             (to_path, req.file.name))
                              if ret == 0 else None)
                else:
                    log.warning(
                        "No commands available to complete this transfer.")
                    ret = -1

        # Check the return code...
        if ret:
            # If the copy didn't work, then the remote file may be corrupted.
            log.error("Rsync failed. Marking source file suspect.")
            di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
            ).execute()
            continue
        et = time.time()

        # Check integrity.
        if md5sum == req.file.md5sum:
            size_mb = req.file.size_b / 2**20.0
            trans_time = et - st
            rate = size_mb / trans_time
            log.info(
                "Pull complete (md5sum correct). Transferred %.1f MB in %i "
                "seconds [%.1f MB/s]" % (size_mb, int(trans_time), rate))

            # Update the FileCopy (if exists), or insert a new FileCopy
            try:
                done = False
                while not done:
                    try:
                        fcopy = (di.ArchiveFileCopy.select().where(
                            di.ArchiveFileCopy.file == req.file,
                            di.ArchiveFileCopy.node == node,
                        ).get())
                        fcopy.has_file = "Y"
                        fcopy.wants_file = "Y"
                        fcopy.save()
                        done = True
                    except pw.OperationalError:
                        log.error(
                            "MySQL connection dropped. Will attempt to reconnect in "
                            "five seconds.")
                        time.sleep(5)
                        di.connect_database(True)
            except pw.DoesNotExist:
                di.ArchiveFileCopy.insert(file=req.file,
                                          node=node,
                                          has_file="Y",
                                          wants_file="Y").execute()

            # Mark any FileCopyRequest for this file as completed
            di.ArchiveFileCopyRequest.update(completed=True).where(
                di.ArchiveFileCopyRequest.file == req.file).where(
                    di.ArchiveFileCopyRequest.group_to ==
                    node.group).execute()

            if node.storage_type == "T":
                # This node is getting the transport king.
                done_transport_this_cycle = True

            # Update local estimate of available space
            avail_gb = avail_gb - req.file.size_b / 2**30.0

        else:
            log.error('Error with md5sum check: %s on node "%s", but %s on '
                      'this node, "%s".' %
                      (req.file.md5sum, req.node_from.name, md5sum, node.name))
            log.error('Removing file "%s/%s".' % (to_path, req.file.name))
            try:
                os.remove("%s/%s" % (to_path, req.file.name))
            except OSError:
                log.error("Could not remove file.")

            # Since the md5sum failed, the remote file may be corrupted.
            log.error("Marking source file suspect.")
            di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
            ).execute()

        if time.time() - start_time > max_time_per_node_operation:
            break  # Don't hog all the time.
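
# update_node_requests() calls two helpers, command_available() and
# run_command(), that are not shown in these examples. Minimal sketches of
# each, inferred only from how they are called above (assumptions, not the
# source implementations):
import shutil
import subprocess


def command_available(cmd):
    """Return True if `cmd` is found on the current PATH."""
    return shutil.which(cmd) is not None


def run_command(cmd_args):
    """Run an argument list and return (returncode, stdout, stderr) as text."""
    proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    return proc.returncode, stdout.decode(), stderr.decode()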
Example #10
def _import_file(node, root, acq_name, file_name):
    """Import a file into the DB.

    This routine adds the following to the database, if they do not already exist
    (or might be corrupted).
    - The acquisition that the file is a part of.
    - Information on the acquisition, if it is of type "corr".
    - The file.
    - Information on the file, if it is of type "corr".
    - Indicates that the file exists on this node.
    """
    global import_done
    curr_done = True
    fullpath = "%s/%s/%s" % (root, acq_name, file_name)
    log.debug("Considering %s for import." % fullpath)

    # Skip the file if ch_master.py still has a lock on it.
    if os.path.isfile("%s/%s/.%s.lock" % (root, acq_name, file_name)):
        log.debug('Skipping "%s", which is locked by ch_master.py.' % fullpath)
        return

    # Parse the path
    try:
        ts, inst, atype = di.parse_acq_name(acq_name)
    except di.Validation:
        log.info("Skipping non-acquisition path %s." % acq_name)
        return

    if import_done is not None:
        i = bisect.bisect_left(import_done, fullpath)
        if i != len(import_done) and import_done[i] == fullpath:
            log.debug("Skipping already-registered file %s." % fullpath)
            return

    # Figure out which acquisition this is; add if necessary.
    try:
        acq = di.ArchiveAcq.get(di.ArchiveAcq.name == acq_name)
        log.debug('Acquisition "%s" already in DB. Skipping.' % acq_name)
    except pw.DoesNotExist:
        acq = add_acq(acq_name)
        if acq is None:
            return
        log.info('Acquisition "%s" added to DB.' % acq_name)

    # What kind of file do we have?
    ftype = di.detect_file_type(file_name)
    if ftype is None:
        log.info('Skipping unrecognised file "%s/%s".' % (acq_name, file_name))
        return

    # Make sure information about the acquisition exists in the DB.
    if atype == "corr" and ftype.name == "corr":
        if not acq.corrinfos.count():
            try:
                di.CorrAcqInfo.create(
                    acq=acq, **get_acqcorrinfo_keywords_from_h5(fullpath))
                log.info(
                    'Added information for correlator acquisition "%s" to '
                    "DB." % acq_name)
            except:
                log.warning('Missing info for acquisition "%s": HDF5 datasets '
                            "empty. Leaving fields NULL." % (acq_name))
                di.CorrAcqInfo.create(acq=acq)
    elif atype == "hk" and ftype.name == "hk":
        try:
            keywords = get_acqhkinfo_keywords_from_h5("%s/%s" %
                                                      (root, acq_name))
        except:
            log.warning("Could no open atmel_id.dat file. Skipping.")
            keywords = []
        for kw in keywords:
            if not sum(1 for _ in di.HKAcqInfo.select().where(
                    di.HKAcqInfo.acq == acq).where(
                        di.HKAcqInfo.atmel_name == kw["atmel_name"])):
                try:
                    di.HKAcqInfo.create(acq=acq, **kw)
                    log.info(
                        'Added information for housekeeping acquisition "%s", '
                        "board %s to DB." % (acq_name, kw["atmel_name"]))
                except:
                    log.warning(
                        'Missing info for acquisition "%s": atmel_id.dat '
                        "file missing or corrupt. Skipping this acquisition." %
                        acq_name)
                    return
    elif atype == "rawadc":
        if not acq.rawadcinfos.count():
            di.RawadcAcqInfo.create(
                acq=acq, **get_acqrawadcinfo_keywords_from_h5(acq_name))
            log.info('Added information for raw ADC acquisition "%s" to '
                     "DB." % acq_name)
    elif atype == "raw":
        try:
            raw_info = acq.rawinfos.get()
        except pw.DoesNotExist:
            try:
                raw_info = di.RawAcqInfo.create(
                    acq=acq,
                    **get_rawinfo_keywords("%s/%s" % (root, acq_name)))
                log.info('Added information for raw acquisition "%s" to DB.' %
                         acq_name)
            except:
                log.info('Missing info in settings.dat for acquisition "%s". '
                         "Skipping this acquisition." % acq_name)
                return

    # Add the file, if necessary.
    try:
        file = di.ArchiveFile.get(di.ArchiveFile.name == file_name,
                                  di.ArchiveFile.acq == acq)
        size_b = file.size_b
        log.debug('File "%s/%s" already in DB. Skipping.' %
                  (acq_name, file_name))
    except pw.DoesNotExist:
        log.debug("Computing md5sum.")
        md5sum = di.md5sum_file(fullpath, cmd_line=True)
        size_b = os.path.getsize(fullpath)
        done = False
        while not done:
            try:
                file = di.ArchiveFile.create(acq=acq,
                                             type=ftype,
                                             name=file_name,
                                             size_b=size_b,
                                             md5sum=md5sum)
                done = True
            except pw.OperationalError:
                log.error(
                    "MySQL connection dropped. Will attempt to reconnect in "
                    "five seconds.")
                time.sleep(5)
                di.connect_database(True)
        log.info('File "%s/%s" added to DB.' % (acq_name, file_name))

    # Register the copy of the file here on the collection server, if (1) it does
    # not exist, or (2) it does exist but has been labelled as corrupt. If (2),
    # check again.
    if not file.copies.where(di.ArchiveFileCopy.node == node).count():
        copy = di.ArchiveFileCopy.create(file=file,
                                         node=node,
                                         has_file="Y",
                                         wants_file="Y")
        log.info('Registered file copy "%s/%s" to DB.' % (acq_name, file_name))

    # Make sure information about the file exists in the DB.
    if ftype.name == "corr":
        # Add if (1) there is no corrinfo or (2) the corrinfo is missing.
        if not file.corrinfos.count():
            try:
                di.CorrFileInfo.create(
                    file=file, **get_filecorrinfo_keywords_from_h5(fullpath))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except:
                if not file.corrinfos.count():
                    di.CorrFileInfo.create(file=file)
                log.warning('Missing info for file "%s/%s": HDF5 datasets '
                            "empty or unreadable. Leaving fields NULL." %
                            (acq_name, file_name))
        elif not file.corrinfos[0].start_time:
            try:
                i = file.corrinfos[0]
                k = get_filecorrinfo_keywords_from_h5(fullpath)
            except:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.chunk_number = k["chunk_number"]
                i.freq_number = k["freq_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
    if ftype.name == "hk":
        # Add if (1) there is no hkinfo or (2) the hkinfo is missing.
        if not file.hkinfos.count():
            try:
                di.HKFileInfo.create(
                    file=file, **get_filehkinfo_keywords_from_h5(fullpath))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except:
                if not file.hkinfos.count():
                    di.HKFileInfo.create(file=file)
                log.warning('Missing info for file "%s/%s": HDF5 datasets '
                            "empty or unreadable. Leaving fields NULL." %
                            (acq_name, file_name))
        elif not file.hkinfos[0].start_time:
            try:
                i = file.hkinfos[0]
                k = get_filehkinfo_keywords_from_h5(fullpath)
            except:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.atmel_name = k["atmel_name"]
                i.chunk_number = k["chunk_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
    if ftype.name == "weather":
        # Add if (1) there is no weatherinfo or (2) the weatherinfo is missing.
        if not file.weatherinfos.count():
            #      try:
            di.WeatherFileInfo.create(
                file=file, **get_fileweatherinfo_keywords_from_h5(fullpath))
            log.info('Added information for file "%s/%s" to DB.' %
                     (acq_name, file_name))
        #      except:
        #        if not file.corrinfos.count():
        #          di.WeatherFileInfo.create(file=file)
        #        log.warning("Missing info for file \"%s/%s\": HDF5 datasets " \
        #                    "empty or unreadable. Leaving fields NULL." %
        #                    (acq_name, file_name))
        elif not file.weatherinfos[0].start_time:
            try:
                i = file.weatherinfos[0]
                k = get_fileweatherinfo_keywords_from_h5(fullpath)
            except:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.date = k["date"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
    if ftype.name == "raw":
        # Add if (1) there is no rawinfo or (2) the rawinfo is missing.
        if not file.rawinfos.count():
            try:
                di.RawFileInfo.create(file=file,
                                      **get_filerawinfo_keywords(
                                          raw_info, size_b, file_name))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except:
                if not file.rawinfos.count():
                    di.RawFileInfo.create(file=file)
                log.warning(
                    'Missing info for file "%s/%s". Leaving fields NULL.' %
                    (acq_name, file_name))
        elif not file.rawinfos[0].start_time:
            try:
                i = file.rawinfos[0]
                k = get_filerawinfo_keywords(raw_info, size_b, file_name)
            except:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.chunk_number = k["chunk_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))

    if import_done is not None:
        bisect.insort_left(import_done, fullpath)
        with open(LOCAL_IMPORT_RECORD, "w") as fp:
            fp.write("\n".join(import_done))
Example #11
    def setUp(self):
        try:
            di.connect_database(read_write=False)
        except:
            raise unittest.SkipTest("Skipping test as couldn't connect to db.")
Example #12
"""Call backs for the HPSS interface.
"""

import click
import peewee as pw
from ch_util import data_index as di

# Import logger here to avoid connection messages for transfer
from . import logger

# Get a reference to the log
log = logger.get_log()

# Reconnect to the database read/write
di.connect_database(read_write=True)


@click.group()
def cli():
    """Call back commands for updating the database from a shell script after an
    HPSS transfer."""


@cli.command()
@click.argument("file_id", type=int)
@click.argument("node_id", type=int)
def push_failed(file_id, node_id):
    """Update the database to reflect that the HPSS transfer failed.

    INTERNAL COMMAND. NOT FOR HUMAN USE!