示例#1
0
    def tearDown(self):
        """Remove the test's staging directory, emptying it first if needed."""
        stage_dir = self.stage
        # Nothing to do when no stage was configured or it never got created.
        if not stage_dir or not os.path.exists(stage_dir):
            return

        file_helper = Files({})
        if not file_helper.is_dir_empty(stage_dir):
            file_helper.empty_dir(stage_dir)
        os.rmdir(stage_dir)
示例#2
0
    def tearDown(self):
        """Remove every test directory, emptying each one before deletion."""
        file_helper = Files({})
        candidates = [
            self.stage,
            self.source,
            self.alt['source']['empty'],
            self.default['stage'],
        ]
        for path in candidates:
            # Skip unset paths and paths that were never created on disk.
            if not path or not os.path.exists(path):
                continue
            if not file_helper.is_dir_empty(path):
                file_helper.empty_dir(path)
            os.rmdir(path)
示例#3
0
def validate_config(config, command):
    """Validate and normalize the config, preferring any previously stored one.

    The "auto" command loads the saved config from the database; every other
    command loads it from the config file.  Each config field is then run
    through its dedicated check_* validator.
    """
    files = Files(config)
    stored = files.db_load_config() if command == "auto" else files.load_config()
    if stored:
        config = stored

    config['begin'], config['end'] = check_dates(config)
    (config['site'], config['instrument'],
     config['facility'], config['datastream']) = check_sif_datastream(config)
    config['source'] = check_source(config)
    config['stage'] = check_stage(config)
    config['job'] = check_job(config)

    return config
示例#4
0
def ask_for_dir(message, default=None, error=None, required=False, level=1):
    """Prompt the user for a directory location.

    Args:
        message: Prompt text shown to the user.
        default: Value used when the user enters nothing.
        error: Message passed to exit() once max_tries is exhausted.
        required: If True, re-prompt (up to max_tries) on empty input.
        level: Current attempt number, used to bound the recursion.

    Returns:
        The cleaned absolute path, or None when nothing was entered and a
        directory is not required.
    """
    f = Files({})
    folder = input(message)
    if folder == '':
        if default is not None:
            folder = default
        elif required:
            if level < max_tries:
                # Re-prompt; the recursive call returns an already-cleaned
                # absolute path (abspath/clean_path are idempotent here).
                folder = ask_for_dir(message, default=default, error=error,
                                     required=required, level=level + 1)
            else:
                exit(error)
        else:
            return None

    folder = abspath(f.clean_path(folder))
    return folder
示例#5
0
def check_stage(config, level=1):
    """Resolve and validate the stage directory.

    Falls back to $REPROC_HOME, then a path derived from $HOME, then a
    conventional /data/home path.  Optionally prompts the user, creates the
    directory if missing, and verifies it is writable, re-prompting up to
    max_tries times before exiting.

    Returns the validated stage directory path.
    """
    f = Files(config)
    stage = config['stage']
    interactive = config['interactive']
    quiet = config['quiet']

    if stage is None:
        reproc_home = os.environ.get('REPROC_HOME')
        home = os.environ.get('HOME')
        if reproc_home is not None:
            stage = reproc_home
        elif home and home.split('/')[1] == 'data':
            # Guard against an unset/empty $HOME before inspecting it.
            stage = '{}/reprocessing/data'.format(home)
        else:
            stage = '/data/home/{}/reprocessing/data'.format(config['username'])
    if interactive:
        message = "Specify a stage directory: ({})".format(stage)
        error = 'A stage directory must be specified. Please try again.'
        default = stage
        stage = ask_for_dir(message, error=error, default=default, required=True)

    if not os.path.exists(stage):
        try:
            os.makedirs(stage)
        except OSError:
            # Only trap filesystem errors; a bare except would also swallow
            # KeyboardInterrupt/SystemExit.
            if level < max_tries and not quiet:
                print('Unable to create {}'.format(stage))
                message = 'Please specify a new location: '
                error = 'A stage directory must be specified. Please try again.'
                config['stage'] = ask_for_dir(message, error=error, required=True)
                stage = check_stage(config, level+1)
            else:
                exit('Unable to create {}. Please try again.'.format(stage))

    if not f.is_dir_writable(stage):
        if level < max_tries and not quiet:
            print('{} is not writable.'.format(stage))
            message = 'Please specify a new location: '
            error = 'A stage directory must be specified. Please try again.'
            config['stage'] = ask_for_dir(message, error=error, required=True)
            stage = check_stage(config, level+1)
        else:
            exit('{} is not writable. Please try again.'.format(stage))

    return stage
示例#6
0
def check_job(config):
    """Resolve the job name, prompting or generating one, then set up its dir.

    Interactive mode offers the configured job as the default.  When no job
    name is available a short unique id is derived from a uuid1.  The chosen
    name is written back into config and its job directory is created.
    """
    job = config['job']

    if config['interactive']:
        prompt = 'Please specify a job name: '
        if job:
            prompt = '{0}({1})'.format(prompt, job)
        response = input(prompt)
        if response != '':
            job = response

    if not job:
        import uuid
        segments = str(uuid.uuid1()).split('-')
        job = '{0}{1}'.format(segments[0], segments[3])

    config['job'] = job
    f = Files(config)
    f.setup_job_dir()

    return job
示例#7
0
def main():
    """APM entry point.

    Parses CLI arguments, validates the config, reconciles the on-disk
    collection against the tracked-file JSON, then dispatches to the
    handler for the requested command.
    """
    # No arguments at all: show help instead of failing.
    if not len(sys.argv) > 1:
        sys.argv.append("-h")

    if '-v' in sys.argv:
        print(apm.__version__)
        return

    # Retrieve arguments from user
    config = parse_args()
    command = config['command'].lower()

    # Check to see if this is a test
    if command == 'test':
        test_config = test.config()

        # unittest.main re-parses sys.argv, so strip APM's own args first.
        sys.argv = [sys.argv[0]]
        jprint(config, sort_keys=True, indent=4)
        unittest.main(buffer=True)
        # unittest.main()
        return

    # Not a test
    if command == 'info' or command == 'vapinfo':
        vap = VapMgr({})
        vap.vap_info()
        return

    # NOTE(review): both branches below are identical; the 'auto'
    # special-case appears vestigial.
    if command == 'auto':
        temp = validate_config(config, command)
        config = temp if temp else config
    else:
        # Validate user arguments
        temp = validate_config(config, command)
        config = temp if temp else config

    if command == 'check':
        jprint(config, sort_keys=True, indent=4)
        return

    # Save the config to file
    s = time.time()
    f = Files(config)
    f.save_config()
    f.load_filenames()
    files = f.files

    # Check to see if any files are not currently being tracked
    # Or if any tracked files have been deleted
    print("Checking status of tracked files...", end="")
    sys.stdout.flush()

    json_file = '{0}/{1}/{1}.json'.format(config['stage'], config['job'])
    if os.path.exists(json_file) and config['ingest']:
        fp = open(json_file, 'r')
        files = json.loads(fp.read())
        fp.close()

        cwd = os.getcwd()
        os.chdir('{}/{}/collection'.format(config['stage'], config['job']))

        # NOTE(review): keys() is a live view in Python 3; files.pop(site)
        # below mutates the dict while iterating and can raise RuntimeError.
        # Wrap in list(...) if this must run under Python 3 — confirm.
        keys = files.keys()

        sites = set(os.listdir('.'))
        for site in keys:
            if site not in sites:
                # Tracked site no longer on disk: drop it from tracking.
                files.pop(site)
                continue

            os.chdir(site)

            instruments = set(os.listdir('.'))
            # Same live-view caveat as above for files[site].pop(ins).
            ins_keys = files[site].keys()

            for ins in ins_keys:
                if ins not in instruments:
                    files[site].pop(ins)
                    continue

                os.chdir(ins)

                filelist = set(os.listdir('.'))
                # Any on-disk file not in the tracking JSON aborts the run
                # (the "other_files" subdirectory is exempt).
                for i in filelist:
                    if i not in files[site][ins] and not (os.path.isdir(i) and i == "other_files"):
                        exit("\nThe file {0}/{1}/{2} is currently untracked.\nPlease edit {3}.json to start tracking this file.\n".format(site, ins, i, config['job']))

                # Tracked files missing on disk get flagged, not removed.
                for i in files[site][ins]:
                    if i not in filelist:
                        files[site][ins][i]["deleted"] = True

                os.chdir('..')

            os.chdir('..')

        os.chdir(cwd)
    print("Done") # Done checking status of tracked files
    sys.stdout.flush()

    # Run the appropriate command
    if command == 'auto':
        print('Attempting to stage files for datastreams: {}'.format(config['datastream']))

        skip = False

        if not config['duplicates']:
            s = Stage(config, files)
            config, files = s.run()

            if config['exit']:
                exit()

            # Stage.run may discover duplicates; renaming is skipped then.
            if config['duplicates']:
                skip = True

        if not skip and not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
        exit()

    elif command == 'stage':
        print("*"*50,"\n", json.dumps(config, indent=2), "*"*50, "\n")
        skip = False

        if not config['duplicates']:
            s = Stage(config, files)
            config, files = s.run()

            if config['exit']:
                exit()

            if config['duplicates']:
                skip = True

        if not skip and not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()

    elif command == 'rename':
        # If rename is called explicitly, force rename even if config is set to false
        switch = True if config['rename'] == False else False

        if switch:
            config['rename'] = True

        if not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()

        # Restore the user's original rename setting afterwards.
        if switch:
            config['rename'] = False

    elif command == 'process':
        # Each remaining command first checks for filename collisions and
        # bails out of the real work if any are found.
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            p = Process(config, files)
            config, files = p.run()
            if config['exit']:
                exit()

    elif command == 'review':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            r = Review(config, files)
            config, files = r.run()
            if config['exit']:
                exit()

    elif command == 'remove':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            r = Remove(config, files)
            config, files = r.run()
            if config['exit']:
                exit()

    elif command == 'archive':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            a = Archive(config, files)
            config, files = a.run()
            if config['exit']:
                exit()

    elif command == 'cleanup':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            c = Cleanup(config, files)
            config, files = c.run()
            if config['exit']:
                exit()

    elif command == 'prep':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files

        if has_coll:
            config = r.config
            files = r.files
        else:
            d = Demo(config, files)
            config, files = d.run()
            if config['exit']:
                exit()

    elif command == "notification":
        # Alka's module goes here
        print('Yay notify the user shit has changed.')

    else:
        # Unknown command: re-run the parser with -h to show usage.
        sys.argv.append("-h")
        config = parse_args()

    # Persist the (possibly updated) config and file tracking state.
    f.config = config
    f.files = files
    f.save_config()
    f.save_filenames()
示例#8
0
def parse_args():
    """Set up argument parsing and parse the command-line arguments.

    Returns a dict of all APM settings derived from the CLI, including the
    initial cleanup_status bookkeeping structure.  Any user-supplied stage
    and source paths are normalized to cleaned absolute paths.
    """
    username = os.environ.get('USER')

    # Setup parser and groups
    parser = argparse.ArgumentParser(description='ARM Processing Manager')
    ui_flags = parser.add_mutually_exclusive_group()
    stage_type = parser.add_mutually_exclusive_group()

    # Setup positional arguments
    parser.add_argument('command', help='Which of the APM stages to run: stage, rename, process, review, remove, archive, cleanup')

    # Demo options
    parser.add_argument('--demo', help='Prep for different stages of a demo, available options include: remove, archive, cleanup')

    # Date
    parser.add_argument('-b', '--begin', type=int, default=0, help='Format: YYYYMMDD - date to start processing data')
    parser.add_argument('-e', '--end', type=int, default=0, help='Format:YYYYMMDD - date to stop processing data')

    # SIF/Datastreams
    parser.add_argument('-s', '--site', help='The site the data is from')
    parser.add_argument('-i', '--instrument', help='The instrument used to collect the data')
    parser.add_argument('-f', '--facility', help='The facility where the instrument is located')
    parser.add_argument('-d', '--datastream', nargs='+', help='One or more datastream patterns. "%%" and "*" can be used as wildcards.')

    # Job
    parser.add_argument('-j', '--job', required=True, help='DQR # for job')

    # Alias
    parser.add_argument('-a', '--alias', help='An alias for the Ingest to use to connect to the database. Def: apm')

    # Flow control flags
    parser.add_argument('--stage', help='Specify a staging directory')
    parser.add_argument('--source', help='Specify a source directory')
    # store_false: passing the flag turns the feature OFF (dest defaults True)
    parser.add_argument('--no-rename', action='store_false', help='Do not strip the ARM prefix from the files')
    parser.add_argument('--no-db-up', action='store_false', help='Do not update the config database')
    parser.add_argument('--no-compare', action='store_false', help='Do not compare the ingest output for re-archiving')

    # Other
    parser.add_argument('--ingest-flags', nargs='+', help='Flags you want APM to pass to the INGEST. Ex. --ingest-flags F (Do not use "-F" APM will add the "-") (Will apply to all ingests if running for multiple datastreams)')

    # Ingest Vs Vap
    stage_type.add_argument('--ingest', action='store_true', help='Ingest vs. VAP (default)')
    stage_type.add_argument('--vap', action='store_true', help='VAP vs. Ingest')

    # UI Flags
    ui_flags.add_argument('-I', '--interactive', action='store_true', help='Prompt for various inputs')
    ui_flags.add_argument('-q', '--quiet', action='store_true', help='Suppresses prompts and exits gracefully if unable to run')
    ui_flags.add_argument('-D', '--devel', action='store_true', help='Run APM in development mode')


    # Parse the args
    arguments = parser.parse_args()

    # Default to ingest mode when neither --ingest nor --vap was given.
    if not arguments.ingest and not arguments.vap:
        arguments.ingest = True

    args = {
        'command': arguments.command,
        'demo': arguments.demo,
        'begin': arguments.begin,
        'end': arguments.end,
        'site': arguments.site,
        'instrument': arguments.instrument,
        'facility': arguments.facility,
        'datastream': arguments.datastream,
        'duplicates': False,
        'job': arguments.job,
        'alias': arguments.alias,
        'stage': arguments.stage,
        'source': arguments.source,
        'rename': arguments.no_rename,
        'db_up': arguments.no_db_up,
        'compare': arguments.no_compare,
        'iflags': arguments.ingest_flags,
        'ingest': arguments.ingest,
        'vap': arguments.vap,
        'interactive': arguments.interactive,
        'quiet': arguments.quiet,
        'devel': arguments.devel,
        'username': username,
        'exit': False,
        "cleanup_status": {
                "review": {
                    "status": True,
                },
                "remove": {
                    "status": False,
                    "deletion_list": False,
                    "archive_list": False,
                    "files_bundled": False,
                },
                "archive": {
                    "status": False,
                    "files_deleted": False,
                    "move_files": False,
                    "files_released": False,
                },
                "cleanup": {
                    "status": False,
                    "files_archived": False,
                    "files_cleaned_up": False,
                },
            }
        }

    f = Files(args)

    # Normalize any user-supplied paths to cleaned absolute paths.
    if args['stage'] is not None:
        args['stage'] = abspath(f.clean_path(args['stage']))
    if args['source'] is not None:
        args['source'] = abspath(f.clean_path(args['source']))

    return args
示例#9
0
    def run(self):
        """Run the archive portion of the cleanup phase.

        Verifies prerequisites (remove step done, authorized user), confirms
        all requested files were deleted from the archive, moves files that
        should not be re-archived into no_archive subdirectories, refreshes
        environment variables, and finally runs release_data for each
        datastream.  Returns (config, files), setting config['exit'] on any
        failure.
        """
        if not self.config['cleanup_status']['remove']['status']:
            print(self.config['cleanup_status']['remove']['status'])
            print('')
            print(
                "Data files must be requested for deletion before the files can be archived."
            )
            self.config['exit'] = True
            return self.config, self.files

        # Setup vars
        stage = self.config['stage']
        job = self.config['job']

        ############################################################
        # Check to see if the current user is `dsmgr`
        ############################################################
        # Verify current user is authenticated to run this command
        if not self.authenticate():
            self.config['exit'] = True
            return self.config, self.files

        # Do this if the files have not yet been verified as deleted from the archive
        if not self.config['cleanup_status']['archive']['files_deleted']:
            print("Verifying all files have been deleted from the archive...",
                  end="")
            ############################################################
            # Update the local archive database
            ############################################################
            # Setup the datastreams to update
            datastreams = []
            datastream_path = dir_pattern(3).format(stage, job, 'datastream')
            for site in os.listdir(datastream_path):
                path = dir_pattern().format(datastream_path, site)
                for folder in os.listdir(path):
                    abs_folder = dir_pattern().format(path, folder)
                    if os.path.isdir(
                            abs_folder) and not os.path.islink(abs_folder):
                        datastreams.append(folder)

            # Update the local copy of the archive db
            if not DEVEL:
                update_archive(datastreams)

            ############################################################
            # Load the list of files to be removed from the archive
            ############################################################
            deleted_files = []
            deletion_file = dir_pattern(3).format(stage, job,
                                                  "%s.deletion-list.txt" % job)
            if not os.path.exists(deletion_file):
                print("Failed")
                print(
                    "Deletion list does not exist. Please create it and try again."
                )
                self.config['exit'] = True
                return self.config, self.files

            fp = open(deletion_file, 'r')
            deletion_text = fp.readlines()
            fp.close()

            # Each line is "<name>.v<version> <md5>".
            # NOTE(review): only CRLF endings are stripped here; lines ending
            # in a bare "\n" keep it attached to the md5 field — confirm the
            # deletion list is always CRLF-terminated.
            for line in deletion_text:
                if line.endswith("\r\n"):
                    line = line[:-2]

                tar = {}
                parts, tar['md5'] = line.split(' ')
                parts = parts.split('.')
                # Last dotted segment is "v<N>"; drop the leading "v".
                tar['version'] = parts[-1][1:]
                tar['name'] = '.'.join(parts[:-1])
                deleted_files.append(tar)

                del tar, parts

            if 'line' in locals():
                del line

            ############################################################
            # Verify all files have been removed from the archive
            ############################################################
            # Get a list of files that are currently at the archive
            archive_files = {}
            db_file = '/apps/ds/conf/datainv/.db_connect'
            alias = 'inv_read'

            db = DB(self.config, db_file=db_file, alias=alias)

            # Store the query
            query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"

            # List the column names so the values can be mapped in a dictionary
            cols = [
                'file_tag', 'file_name', 'file_version', 'file_size',
                'file_stored', 'file_md5', 'file_stamp', 'file_checked',
                'file_active'
            ]

            # convert the start and end dates to a unix timestamp
            start = convert_date_to_timestamp(self.config['begin'])
            end = convert_date_to_timestamp(self.config['end'])

            # Query the database for each of the datastreams
            for k, v in enumerate(datastreams):
                args = (v, start, end)
                result = db.query(query % args, columns=cols)

                if len(result) > 0:
                    archive_files[v] = result
                else:
                    # Datastream with no archive results is reported but NOT
                    # added to archive_files (see NOTE below).
                    print("Failed")
                    print("No results for %s" % v)

            # Store the list of what is currently in the archive and their versions to file
            current_archive = dir_pattern(3).format(stage, job,
                                                    'current_archive.json')
            fp = open(current_archive, 'w')
            fp.write(
                json.dumps(archive_files,
                           indent=2,
                           sort_keys=False,
                           separators=(',', ': ')))
            fp.close()
            del fp

            if DEVEL:
                file_path = dir_pattern(3).format(stage, job,
                                                  '%s.archive.json' % job)
                if os.path.exists(file_path):
                    fp = open(file_path, 'r')
                    archive_files = json.loads(fp.read())
                    fp.close()

                    del fp, file_path

            # Check to see if any of the "deleted_files" are in the list
            # If yes, quit
            # If no, proceed
            all_files_deleted = None

            if len(deleted_files) > 0:
                # Check the list of files from the archive to see if the current file has been deleted
                for f in deleted_files:
                    # Datastream/process tag is the first two dotted segments.
                    process = '.'.join(f['name'].split('.')[0:2])
                    name = f['name']

                    # NOTE(review): raises KeyError if `process` had no query
                    # results above (it was never added to archive_files) —
                    # confirm whether that case can occur in practice.
                    if any(d['file_name'] == name
                           for d in archive_files[process]):
                        all_files_deleted = False
                        print("Failed")
                        print(
                            "Not all files have been deleted from the archive."
                        )
                        print("Please try again later.")
                        self.config['exit'] = True
                        return self.config, self.files

                # for/else: the loop has no break, so this runs whenever the
                # loop completes (i.e. no file triggered the early return).
                else:
                    all_files_deleted = True

            else:
                all_files_deleted = True

            if 'f' in locals():
                del f
            if 'process' in locals():
                del process

            if all_files_deleted != True:
                print("Failed")
                print("Not all files have been removed from the archive.")
                print(
                    "Run this again once all files have been removed from the archive."
                )
                self.config['exit'] = True
                return self.config, self.files

            # Files have been deleted
            self.config['cleanup_status']['archive']['files_deleted'] = True
            print("Done")

        ############################################################
        # Move any files not being archived to subdirectories
        #
        # Processed files:
        # This includes any processed files outside the
        # 	date range specified
        # Raw/Tar files:
        # This includes any files that do not need to be rearchived
        ############################################################
        if not self.config['cleanup_status']['archive']['move_files']:
            print("Moving files that should not be archived...", end="")

            cwd = os.getcwd()
            datastream = dir_pattern(3).format(stage, job, 'datastream')

            # Load the list of tar files that need to be archived
            os.chdir(dir_pattern().format(stage, job))
            fp = open('archive.json', 'r')
            contents = json.loads(fp.read())
            fp.close()
            # Index archive entries as tar_archive[site][instrument] -> [names]
            tar_archive = {}
            for k, v in enumerate(contents):
                s = v['site']
                p = v['instrument']
                if s not in tar_archive:
                    tar_archive[s] = {}
                if p not in tar_archive[s]:
                    tar_archive[s][p] = []

                tar_archive[s][p].append(v['file_name'])

            if len(contents) > 0:
                del s, p, k, v

            os.chdir(datastream)
            sites = os.listdir(datastream)
            for i, s in enumerate(sites):
                os.chdir(s)
                processes = os.listdir('.')
                for j, p in enumerate(processes):
                    no_archive = dir_pattern(4).format(datastream, s, p,
                                                       'no_archive')
                    os.chdir(p)

                    # Datastream level "00" marks raw data.
                    if p.split('.')[-1] == '00':
                        # This is a raw datastream
                        # Don't include directories

                        # Get a list of non-tar files from the raw datastreams
                        # Move all of these files to a sub-directory
                        rawfiles = [
                            x for x in os.listdir('.') if not x.endswith('tar')
                            if not os.path.isdir(x)
                        ]

                        # Get a list of all tar files from the raw datastreams
                        # Retrieve the list of tar files that need to be archived
                        # Move all of the files not in the list to a sub-directory
                        tarfiles = [
                            x for x in glob("*.tar") if not os.path.isdir(x)
                        ]

                        for x in rawfiles:
                            if not os.path.exists(no_archive):
                                os.mkdir(no_archive)
                            elif not os.path.isdir(no_archive):
                                print("Failed")
                                print(
                                    "There is a file called 'no_archive' in %s."
                                )
                                print(
                                    "This file must be removed before proceeding."
                                )
                                self.config['exit'] = True
                                return self.config, self.files

                            src = dir_pattern(4).format(datastream, s, p, x)
                            # os.rename fails across filesystems; fall back
                            # to shutil.move which copies then deletes.
                            try:
                                os.rename(src, no_archive)
                            except OSError:
                                shutil.move(src, no_archive)

                        for x in tarfiles:
                            if not os.path.exists(no_archive):
                                os.mkdir(no_archive)
                            elif not os.path.isdir(no_archive):
                                print("Failed")
                                print(
                                    "There is a file called 'no_archive' in %s."
                                )
                                print(
                                    "This file must be removed before proceeding."
                                )
                                self.config['exit'] = True
                                return self.config, self.files

                            # Only move tars that are NOT slated for archive.
                            if s not in tar_archive or p not in tar_archive[
                                    s] or x not in tar_archive[s][p]:
                                src = dir_pattern(4).format(
                                    datastream, s, p, x)
                                try:
                                    os.rename(src, no_archive)
                                except OSError:
                                    shutil.move(src, no_archive)

                    else:
                        # For each processed datastream
                        # Get a list of all the files
                        # Move any files that fall outside the specified date range to a sub-directory
                        if not os.path.exists(no_archive):
                            os.mkdir(no_archive)
                        elif not os.path.isdir(no_archive):
                            print("Failed")
                            print("There is a file called 'no_archive' in %s.")
                            print(
                                "This file must be removed before proceeding.")
                            self.config['exit'] = True
                            return self.config, self.files

                        # Don't include directories
                        files = [
                            x for x in os.listdir('.') if not os.path.isdir(x)
                        ]

                        timeformat = "%Y%m%d"
                        begin = datetime.strptime(str(self.config['begin']),
                                                  timeformat)
                        end = datetime.strptime(str(self.config['end']),
                                                timeformat)

                        for x in files:
                            # Filenames are dot-delimited with the date in
                            # the third segment (e.g. name.level.YYYYMMDD...).
                            date = x.split('.')[2]
                            filedate = datetime.strptime(date, timeformat)

                            if not (filedate >= begin and filedate <= end):
                                src = dir_pattern(4).format(
                                    datastream, s, p, x)
                                try:
                                    os.rename(src, no_archive)
                                except OSError:
                                    shutil.move(src, no_archive)

                    os.chdir('..')
                os.chdir('..')
            os.chdir(cwd)

            print("Done")
            self.config['cleanup_status']['archive']['move_files'] = True
        ############################################################
        # Read environment variables
        ############################################################
        print("Updating environment variables...", end="")

        env_path = dir_pattern().format(stage, job)

        if not update_env(env_path):
            f = Files(self.config)
            shell = f.get_shell()
            if shell == "bash":
                ext = 'sh'
            else:
                ext = 'csh'

            print("Failed")
            exit("Error: Unable to locate env.%s." % ext)

        print("Done")  # Updating Env Vars

        ############################################################
        # Ensure `DBCONNECT_PATH` does not point to job `.db_connect` file
        ############################################################
        if 'DBCONNECT_PATH' in os.environ:
            del os.environ['DBCONNECT_PATH']

        # The command should be complete up to this point,
        # however I'm waiting on a response to verify the exact name
        # of this environment variable

        ############################################################
        # Run `release_data`
        ############################################################
        print("Running release_data...", end="")

        #############################################
        # Need to change this so it supports both
        #  `sif` data and `datastream` data
        #############################################
        db = DB(self.config)

        data_paths = db.get_data_paths()

        commands = []

        for d in data_paths:
            output = d['output']
            (site, temp) = output.split('/')
            # Drop the 3-char site prefix from the datastream name.
            temp = temp.split('.')[0][3:]
            # Scan backwards for the last non-digit: everything from there
            # on is the facility designator, the rest is the process name.
            for i, e in reversed(list(enumerate(temp))):
                if not is_number(e):
                    fac = i
                    break
            else:
                # No non-digit found: cannot split facility from process.
                print("Could not separate facility from %s" % temp)
                self.config['exit'] = True
                return self.config, self.files

            facility = temp[fac:]
            process = temp[:fac]
            command = ['release_data', '-s', site, '-f', facility, process]
            # Check to see if a plugin needs to modify the command
            command = self.manager.callPluginCommand(
                'hook_release_data_command_alter', command)
            commands.append(command)

        # code to run a shell command copied from other part of APM
        # Needs modified to work here

        # Run the command
        for command in commands:
            try:
                if not DEVEL:
                    ps = Popen(command, stdout=PIPE, stderr=PIPE)
                    ps.communicate()
                    returncode = ps.returncode
                    if returncode != 0:
                        print("Failed")
                        self.config['exit'] = True
                        return self.config, self.files
            except CalledProcessError as e:
                print("Failed")
                self.config['exit'] = True
                return self.config, self.files
            except Exception as e:
                raise e

        print("Done")

        # Files have been released
        self.config['cleanup_status']['archive']['files_released'] = True

        # Archive is complete
        self.config['cleanup_status']['archive']['status'] = True

        return self.config, self.files