Example #1
    def run(self):
        manager = PluginManager()
        config = self.config
        f = Files(config)
        cwd = os.getcwd()
        stage = config['stage']
        collection = dir_pattern(3).format(stage, config['job'], 'collection')

        # Make sure files are supposed to be renamed
        if not config['rename']:
            return config, self.files

        # Verify there are no file collisions
        if self.check_for_collisions():
            return config, self.files

        # Strip the ARM prefix from all of the files
        print("\nStripping ARM prefix from files... ", end="")
        sys.stdout.flush()

        manager.callPluginCommand('hook_rename_preprocess', {'config': config})

        os.chdir(collection)
        sites = set(os.listdir('.'))
        for site in sites:
            os.chdir(site)
            instruments = set(os.listdir('.'))
            for ins in instruments:
                os.chdir(ins)
                files = set(os.listdir('.'))
                for i in files:
                    new_name = f.rename_file(i)
                    if new_name is not None:
                        if i != new_name:
                            self.files[site][ins][new_name] = self.files[site][
                                ins][i]
                            self.files[site][ins].pop(i)

                        self.files[site][ins][new_name][
                            'current_name'] = new_name
                        self.files[site][ins][new_name][
                            'stripped_name'] = new_name

                os.chdir('..')

            os.chdir('..')

        # Return to the directory saved in `cwd` above
        os.chdir(cwd)

        manager.callPluginCommand('hook_renamed_files_alter',
                                  {'config': config})

        print("Done\n")
        sys.stdout.flush()

        return config, self.files
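
All of these examples lean on a dir_pattern helper defined elsewhere in the codebase. Judging only from call sites such as dir_pattern(3).format(stage, config['job'], 'collection'), it plausibly returns a '/'-joined format string with N placeholders. A minimal sketch under that assumption (a reconstruction, not the project's actual implementation):

def dir_pattern(count=2):
    """Return a '/'-joined format string with `count` placeholders.

    Assumed behavior, inferred from the call sites above:
    dir_pattern(3).format('a', 'b', 'c') -> 'a/b/c'
    """
    return '/'.join(['{}'] * count)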
Example #2
    def check_for_collisions(self):
        """ Check all unpacked files for file naming collisions """
        print("Checking for file naming collisions...", end="")
        sys.stdout.flush()

        config = self.config
        f = Files(config, self.files)
        cwd = os.getcwd()
        collection = dir_pattern(3).format(config['stage'], config['job'],
                                           'collection')
        os.chdir(collection)

        sites = os.listdir('.')
        for site in sites:
            os.chdir(site)
            instruments = set(os.listdir('.'))
            for ins in instruments:
                os.chdir(ins)
                files = set(os.listdir('.'))
                names = self.files[site][ins]

                # Mark files as deleted
                for k, v in names.items():
                    if k not in files:
                        v['deleted'] = True

                # Check for duplicates
                for k, v in names.items():
                    if len(v['duplicate_files']) > 0 and not v['deleted']:
                        for i in v['duplicate_files']:
                            name = f.get_file_by_uuid(i)
                            if names[name]['uuid'] == i and not names[name][
                                    'deleted']:
                                config['duplicates'] = True
                                print("Fail")
                                print(
                                    "Files with naming collisions still exist.\nPlease resolve these issues before continuing.\n"
                                )
                                os.chdir(cwd)
                                return True

                os.chdir('..')

            os.chdir('..')

        os.chdir(cwd)
        config['duplicates'] = False
        print("Done")
        sys.stdout.flush()
        return False
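
check_for_collisions walks self.files as a site -> instrument -> filename mapping whose per-file records carry the uuid, duplicate_files, and deleted fields read above. Example #5 below is where that structure gets built; for reference, one record looks like this (the file name is illustrative):

import uuid

# Shape of one entry in self.files[site][instrument], as created in Example #5.
record = {
    "uuid": str(uuid.uuid4()),
    "current_name": "sgpmfrsrC1.00.20190101.000000.raw.dat",
    "original_name": "sgpmfrsrC1.00.20190101.000000.raw.dat",
    "stripped_name": None,
    "processed_name": None,
    "unpacked_name": "sgpmfrsrC1.00.20190101.000000.raw.dat",
    "duplicate_files": [],  # UUIDs of files this one collided with
    "deleted": False,
}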
Example #3
    def run(self):
        """ Run the cleanup portion of the cleanup phase """
        if not self.config['cleanup_status']['archive']['status']:
            print("Data files must be archived before they can be cleaned up.")
            self.config['exit'] = True
            return self.config, self.files

        stage = self.config['stage']
        job = self.config['job']

        ################################################################################
        # Update local archive database
        ################################################################################
        if not self.config['cleanup_status']['cleanup']['files_archived']:
            print("Updating local copy of the archive...", end="")
            # Setup the datastreams to update
            datastreams = []
            datastream_path = dir_pattern(3).format(stage, job, 'datastream')
            for site in os.listdir(datastream_path):
                path = dir_pattern().format(datastream_path, site)
                for folder in os.listdir(path):
                    abs_folder = dir_pattern().format(path, folder)
                    if os.path.isdir(
                            abs_folder) and not os.path.islink(abs_folder):
                        datastreams.append(folder)

            # Update the local copy of the archive db
            if not DEVEL:
                update_archive(datastreams)

            print("Done")
            ################################################################################
            # Verify that all files to be added to the archive, were added
            ################################################################################
            print(
                "Verifying processed and bundled files have been archived...",
                end="")
            cwd = os.getcwd()

            archive_files = {}
            db_file = '/apps/ds/conf/datainv/.db_connect'
            alias = 'inv_read'

            if not os.path.exists(db_file):
                print("Failed")
                print(
                    "Unable to connect to the archive database. Please try again later."
                )
                self.config['exit'] = True
                return self.config, self.files

            db = DB(self.config, db_file=db_file, alias=alias)

            # Store the query
            query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"

            # List the column names so the values can be mapped in a dictionary
            cols = [
                'file_tag', 'file_name', 'file_version', 'file_size',
                'file_stored', 'file_md5', 'file_stamp', 'file_checked',
                'file_active'
            ]

            # convert the start and end dates to a unix timestamp
            start = convert_date_to_timestamp(self.config['begin'])
            end = convert_date_to_timestamp(self.config['end'])

            archive_file = dir_pattern(3).format(stage, job,
                                                 'current_archive.json')
            with open(archive_file, 'r') as fp:
                oArch = json.loads(fp.read())

            os.chdir(datastream_path)
            for site in os.listdir('.'):
                path = dir_pattern().format(datastream_path, site)
                os.chdir(site)

                for folder in os.listdir('.'):
                    os.chdir(folder)

                    args = (folder, start, end)
                    result = db.query(query % args, columns=cols)

                    for f in os.listdir('.'):
                        if not os.path.isdir(dir_pattern().format(
                                os.getcwd(), f)):
                            try:
                                new_version = next(d['file_version']
                                                   for d in result
                                                   if d['file_name'] == f)
                                old_version = next(o['file_version']
                                                   for o in oArch[folder]
                                                   if o['file_name'] == f)
                                if new_version <= old_version:
                                    print("Failed")
                                    print(
                                        "Not all files have been successfully archived. Please try again later."
                                    )
                                    self.config['exit'] = True
                                    return self.config, self.files
                            except StopIteration:
                                pass

                    os.chdir('..')
                os.chdir('..')

            os.chdir(cwd)
            self.config['cleanup_status']['cleanup']['files_archived'] = True
            print("Done")

        ################################################################################
        # Remove all files from `<job>/datastream`
        ################################################################################
        if not self.config['cleanup_status']['cleanup']['files_cleaned_up']:
            print("Cleaning up project files...", end="")
            # Remove archive.json
            # Remove current_archive.json
            # Remove <job>.deletion-list.txt

            f = Files(self.config)
            path = dir_pattern().format(stage, job)
            delete = [
                "datastream",
                "collection",
                "file_comparison/raw",
                "file_comparison/tar",
                'archive.json',
                'current_archive.json',
                '%s.deletion-list.txt' % job,
            ]

            try:
                for i in delete:
                    item = dir_pattern().format(path, i)
                    if os.path.exists(item):
                        if os.path.isdir(item):
                            f.empty_dir(item)
                        elif os.path.isfile(item):
                            os.remove(item)

            except Exception:
                print("Failed")
                print(
                    "Unable to cleanup all files. Please try again, or cleanup project manually."
                )
                self.config['exit'] = True
                return self.config, self.files

            print("Done")
            self.config['cleanup_status']['cleanup']['files_cleaned_up'] = True

        self.config['cleanup_status']['cleanup']['status'] = True
        return self.config, self.files
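
Both this example and Example #4 pass config['begin'] and config['end'] through convert_date_to_timestamp so they can be compared against the integer file_stamp column in the query. The helper itself is not shown; a minimal sketch, assuming the dates arrive as 'YYYYMMDD' strings:

import calendar
import time

def convert_date_to_timestamp(date):
    """Hypothetical stand-in: parse a 'YYYYMMDD' string into a UTC unix
    timestamp suitable for the file_stamp range comparison above."""
    return calendar.timegm(time.strptime(str(date), '%Y%m%d'))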
Example #4
    def run(self):
        """ Run the remove portion of the cleanup phase """
        self.start_time = datetime.now()
        if not self.config['cleanup_status']['review']['status']:
            print(
                "\nData must be reviewed before it can be removed from the archive."
            )
            self.config['exit'] = True
            return self.config, self.files

        stage = self.config['stage']
        job = self.config['job']

        del_file = '%s.deletion-list.txt' % job
        job_folder = dir_pattern().format(stage, job)

        exists = False
        replace = False

        # Check to see if deletion file exists
        if os.path.exists(dir_pattern().format(job_folder, del_file)):
            exists = True
            ui = UI()
            replace = ui.yn_choice(
                '%s already exists.\nWould you like to overwrite this file?' %
                del_file, 'n')

        if exists and not replace:
            return self.config, self.files

        # Either file doesn't exist or user has chosen to overwrite it
        # Create <job>.deletion-list.txt file

        # Reset statuses for this run
        for k in self.config['cleanup_status']['remove']:
            self.config['cleanup_status']['remove'][k] = False

        contents = []

        ##################################################
        # Get list of files from datastream folder
        ##################################################
        datastreams = []
        datastream_path = dir_pattern(3).format(stage, job, 'datastream')
        for site in os.listdir(datastream_path):
            path = dir_pattern().format(datastream_path, site)
            for folder in os.listdir(path):
                abs_folder = dir_pattern().format(path, folder)
                if os.path.isdir(
                        abs_folder) and not os.path.islink(abs_folder):
                    datastreams.append(folder)

        # Processed files
        p_files = {}
        for v in datastreams:
            # `site` here is the last site visited in the scan above
            p_files[v] = os.listdir(
                dir_pattern(3).format(datastream_path, site, v))

        ##################################################
        # Update the local copy of the archive db
        ##################################################
        # print("\nUpdating list of files stored at the archive..."
        # if not DEVEL:
        # 	update_archive(datastreams)
        # print("Done"
        ##################################################
        # Get list of files from archive db
        ##################################################
        print("\nRetrieving list of relevant files stored at the archive...",
              end="")
        # Connect to the database
        archive_files = {}
        db_file = '/apps/ds/conf/datainv/.db_connect'
        alias = 'inv_read'

        if not os.path.exists(db_file):
            print(
                "\nUnable to connect to the archive database. Please try again later."
            )
            self.config['exit'] = True
            return self.config, self.files

        db = DB(self.config, db_file=db_file, alias=alias)

        # Store the query
        query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"

        # List the column names so the values can be mapped in a dictionary
        cols = [
            'file_tag', 'file_name', 'file_version', 'file_size',
            'file_stored', 'file_md5', 'file_stamp', 'file_checked',
            'file_active'
        ]

        # convert the start and end dates to a unix timestamp
        start = convert_date_to_timestamp(self.config['begin'])
        end = convert_date_to_timestamp(self.config['end'])

        # Query the database for each of the datastreams
        for k, v in enumerate(datastreams):
            args = (v, start, end)
            result = db.query(query % args, columns=cols)

            if len(result) > 0:
                archive_files[v] = result
            else:
                print("\nNo results for %s" % v)

        # Unset loop variables
        if len(datastreams) > 0:
            del k, v, args, result

        print("Done")

        print("Map original tar bundle structure...", end="")
        self.maps['orig']['tar'] = self.get_tar_structure(
            dir_pattern(3).format(stage, job, "file_comparison/tar"))
        print("Done")

        if self.config['ingest']:
            # Add files to the list that should be removed from the archive
            print("\nGenerating list of files to remove from the archive...")
            sys.stdout.flush()
            ##################################################
            # Compare raw files to see if they changed
            ##################################################

            # Setup Variables for the following code to use
            file_history = self.files  # List of files as they have traveled from tar file through the ingest. Mapped by their current name

            # The datastreams that contain the raw files (ex. sgpmfrsrC1.00)
            raw_streams = []

            # Setup the paths for the ingested and untarred raw files
            new_folder = dir_pattern(3).format(stage, job, 'datastream')
            old_folder = dir_pattern(3).format(stage, job,
                                               'file_comparison/raw')

            # Mapping of raw files in the <job>/datastream folder
            raw_files = {}
            # List of tar files at the archive
            archive_tars = {}

            # Does the raw data in "datastream" need to be re-bundled?
            bundle_data = False

            # Get a list of the sites in "datastream"
            for site in os.listdir(new_folder):
                raw_files[site] = {}

            # Establish a structure for the raw files in "datastream"
            #  This structure follows the same pattern as 'file_history'
            for site in raw_files:
                for instrument in glob(
                        dir_pattern(3).format(new_folder, site, '*.00')):
                    instrument = instrument.split('/')[-1]
                    raw_files[site][instrument] = {}
                    raw_streams.append(instrument)
                    for f in os.listdir(
                            dir_pattern(3).format(new_folder, site,
                                                  instrument)):
                        raw_files[site][instrument][f] = {}

            # Compare all of the existing files
            #  By comparing existing files instead of files that were unpacked
            #  we make sure to include all files and can check for files that are not being tracked
            #  (This should never happen)
            c = Files(self.config)
            for i, s in raw_files.items():  # i = key, s = site
                for j, p in s.items():  # j = key, p = process/instrument
                    pbar = UI()
                    percent = 0
                    pbar.progress(percent)
                    count = len(p)
                    l = 1
                    for k, f in p.items():  # k = key, f = file
                        # Compare the file in 'datastream' with its counterpart in 'file_comparison/raw'
                        if k not in file_history[i][
                                j]:  # This if statement should never evaluate "True"
                            # File is not being tracked
                            # Raw files in datastream need to be rebundled
                            bundle_data = True

                            # Tar file with this raw file needs to be added to the archive
                            # Make sure the site is in the dict
                            if i not in self.archive['add']['raw']:
                                self.archive['add']['raw'][i] = {j: {}}

                            # Make sure the process is in the dict
                            if j not in self.archive['add']['raw'][i]:
                                self.archive['add']['raw'][i][j] = {}

                            # Add the file to the dict
                            self.archive['add']['raw'][i][j][k] = {}

                            continue  # Go to the next iteration of the loop (file cannot be compared because there is no counterpart)

                        # Compare the ingested raw file with the unpacked raw file
                        file_path = dir_pattern(5).format(
                            stage, job, '%s', i, j)
                        file_1 = dir_pattern().format(file_path % 'datastream',
                                                      k)
                        file_2 = dir_pattern().format(
                            file_path % 'file_comparison/raw',
                            file_history[i][j][k]['original_name'])
                        if not c.is_same_file(file_1, file_2):
                            # The files are not the same. Raw files in datastream need to be rebundled
                            bundle_data = True

                            # Ensure self.archive['remove']['raw'] has the proper structure
                            if i not in self.archive['remove']['raw']:
                                self.archive['remove']['raw'][i] = {j: []}

                            if j not in self.archive['remove']['raw'][i]:
                                self.archive['remove']['raw'][i][j] = []

                            self.archive['remove']['raw'][i][j].append(k)

                            # Make self.archive['remove']['raw'][i][j] a unique list
                            self.archive['remove']['raw'][i][j] = list(
                                set(self.archive['remove']['raw'][i][j]))

                        percent = int((float(l) / float(count)) * 100)
                        pbar.progress(percent)
                        l = l + 1

                    if percent < 100:
                        pbar.progress(100)
                    print("")
                    sys.stdout.flush()

            # Unset loop variables
            if len(raw_files) > 0:
                del i, j, k, s, p, f, c

            if bundle_data:
                # Fill self.maps['orig']['history'] and bundle the data
                for site in file_history:
                    if site not in self.maps['orig']['history']:
                        self.maps['orig']['history'][site] = {}

                    for process in file_history[site]:
                        if process not in self.maps['orig']['history'][site]:
                            self.maps['orig']['history'][site][process] = {}

                        for f, d in file_history[site][process].items():
                            if d['original_name'] not in self.maps['orig'][
                                    'history'][site][process]:
                                self.maps['orig']['history'][site][process][
                                    d['original_name']] = d

                # Find any orig/bad files and copy them over (correcting names as necessary)
                other_files_path = dir_pattern(3).format(
                    stage, job, 'file_comparison/raw/%s/%s/%s')
                for i, s in self.maps['orig']['history'].items():
                    for j, p in s.items():
                        bad_files = glob(other_files_path % (i, j, '*.bad.*'))
                        orig_files = glob(other_files_path %
                                          (i, j, '*.orig.*'))
                        edit_files = glob(other_files_path %
                                          (i, j, '*.edit*.*'))

                        # if len(orig_files) > 0:
                        # 	pbar = UI()
                        # 	count = len(orig_files)
                        # 	pbar.progress(0)

                        for of in orig_files:
                            oFile = of.split('/')[-1]
                            if oFile in p:
                                key = oFile.replace('orig', 'raw')
                                if key in p:
                                    filename = p[key]['current_name'].replace(
                                        '.raw.', '.orig.')
                                    filename = dir_pattern(6).format(
                                        stage, job, 'datastream', i, j,
                                        filename)
                                    shutil.copy(of, filename)

                        # print(""
                        # sys.stdout.flush()

                        # if len(bad_files) > 0:
                        # 	pbar = UI()
                        # 	count = len(bad_files)
                        # 	pbar.progress(0)
                        for k, bf in enumerate(bad_files):
                            bFile = bf.split('/')[-1]
                            if bFile in p:
                                key = bFile.replace('bad', 'raw')

                                if key in p:
                                    filename = p[key]['current_name'].replace(
                                        '.raw.', '.bad.')
                                else:
                                    filename = bFile

                                filename = dir_pattern(6).format(
                                    stage, job, 'datastream', i, j, filename)
                                shutil.copy(bf, filename)

                            # # Update progress bar
                            # pbar.progress(int((float(k + 1) / float(count)) * 100))

                        # print(""
                        # sys.stdout.flush()

                        # if len(edit_files) > 0:
                        # 	pbar = UI()
                        # 	count = len(edit_files)
                        # 	pbar.progress(0)
                        for k, ef in enumerate(edit_files):
                            eFile = ef.split('/')[-1]
                            edit = None
                            for t in eFile.split('.'):
                                if t.startswith('edit'):
                                    edit = t
                                    break

                            if edit is not None and eFile in p:
                                key = eFile.replace(edit, 'raw')
                                if key in p:
                                    filename = p[key]['current_name'].replace(
                                        '.raw.', ".%s." % edit)
                                    filename = dir_pattern(6).format(
                                        stage, job, 'datastream', i, j,
                                        filename)
                                    shutil.copy(ef, filename)

                            # # Update progress bar
                            # pbar.progress(int((float(k + 1) / float(count)) * 100))

                        # print(""
                        # sys.stdout.flush()

                        del j, p
                    del i, s

                # Create any needed orig files
                print("Create needed orig files...")
                sys.stdout.flush()

                for i, s in self.archive['remove']['raw'].items():
                    for j, p in s.items():
                        path = dir_pattern(5).format(stage, job, "datastream",
                                                     i, j)
                        pbar = UI()
                        percent = 0
                        k = 0
                        count = len(p)
                        for f in p:
                            orig = f.replace('.raw.', '.orig.')
                            if not os.path.exists(dir_pattern().format(
                                    path, orig)):
                                src = dir_pattern(6).format(
                                    stage, job, "file_comparison/raw", i, j,
                                    file_history[i][j][f]['unpacked_name'])
                                dst = dir_pattern().format(path, orig)
                                shutil.copy(src, dst)
                                # del src, dst
                            percent = int((float(k) / float(count)) * 100)
                            pbar.progress(percent)
                            k = k + 1

                        if percent < 100:
                            pbar.progress(100)
                        print("")

                    # Unset loop variables
                    # del i, s, j, p, path, f, orig, src, dst

                print("Done")

                # Bundle the data
                self.bundle_raw_data(raw_streams)
                self.config['cleanup_status']['remove']['files_bundled'] = True

                print("Map new tar bundle structure...", end="")
                self.maps['new']['tar'] = self.get_tar_structure(
                    dir_pattern(3).format(stage, job, "datastream"))
                print("Done")

                print("")
                print("Mapping raw structure from original tar files...",
                      end="")
                self.maps['orig']['raw'] = self.map_raw_structure(
                    self.maps['orig']['tar'])
                print("Done")

                print("Mapping raw structure from new tar files...", end="")
                self.maps['new']['raw'] = self.map_raw_structure(
                    self.maps['new']['tar'])
                print("Done")

                ##################################################
                # Find all of the tar files that need
                #   to be removed from the archive
                ##################################################
                print("")
                print(
                    "Generating list of tar files to be removed from the archive..."
                )
                sys.stdout.flush()

                # Find all of the tar files that need to be removed from the archive
                for i, s in self.archive['remove']['raw'].items():
                    percent = 0
                    for j, p in s.items():
                        pbar = UI()
                        count = len(p)
                        pbar.progress(percent)
                        k = 1
                        for raw_file in p:
                            tar_files = self.find_original_tar_bundle(
                                file_history[i][j][raw_file]['original_name'],
                                i, j)
                            for f in tar_files:
                                if f not in self.archive['remove']['tar']:
                                    tar = {
                                        'site': i,
                                        'instrument': j,
                                        'file_name': f
                                    }
                                    self.archive['remove']['tar'].append(tar)
                            percent = int((float(k) / float(count)) * 100)
                            pbar.progress(percent)
                            k = k + 1

                        if percent == 99:
                            pbar.progress(100)
                        print("")
                        sys.stdout.flush()

                # Unset loop variables
                if len(self.archive['remove']['raw']) > 0:
                    del i, s, j, p, raw_file, tar_files, f, tar

                print("Done")

                ##################################################
                # Find all of the tar files that need
                #   to be added to the archive
                ##################################################
                print("")
                print(
                    "Generating list of tar files to be added to the archive..."
                )
                pbar = UI()
                pbar.progress(0)
                count = len(self.archive['remove']['tar'])
                percent = 0
                i = 1

                # Find all of the tar files that need to be added to the archive
                for tar_file in self.archive['remove']['tar']:
                    files = self.find_all_files_from_original_tar(
                        tar_file['file_name'], tar_file['site'],
                        tar_file['instrument'])
                    for f in files:
                        temp = f
                        if not any(d['file_name'] == temp
                                   for d in self.archive['add']['tar']):
                            tar = {
                                'site': tar_file['site'],
                                'instrument': tar_file['instrument'],
                                'file_name': f
                            }

                            self.archive['add']['tar'].append(tar)
                    percent = int((float(i) / float(count)) * 100)
                    pbar.progress(percent)
                    i = i + 1

                if percent == 99:
                    pbar.progress(100)
                print("")
                sys.stdout.flush()

                # Unset loop variables
                if len(self.archive['remove']['tar']) > 0:
                    del tar_file, files, f

                for i, s in self.archive['add']['raw'].items():
                    for j, p in s.items():
                        pbar = UI()
                        pbar.progress(0)
                        percent = 0
                        count = len(p)
                        # Separate counter so the site key `i` from the outer
                        # loop is not clobbered
                        n = 1
                        for raw_file, info in p.items():
                            tar_files = self.find_original_tar_bundle(
                                raw_file, i, j)
                            for f in tar_files:
                                temp = f
                                if not any(
                                        d['file_name'] == temp
                                        for d in self.archive['add']['tar']):
                                    tar = {
                                        'site': i,
                                        'instrument': j,
                                        'file_name': f
                                    }
                                    self.archive['add']['tar'].append(tar)
                            percent = int((float(n) / float(count)) * 100)
                            pbar.progress(percent)
                            n = n + 1

                        if percent == 99:
                            pbar.progress(100)
                        print("")
                        sys.stdout.flush()

                # Unset loop variables
                if len(self.archive['add']['raw']) > 0:
                    del i, s, j, p, raw_file, info, tar_files

                    if 'f' in locals():
                        del f
                    if 'tar' in locals():
                        del tar

                ##################################################
                # Update archive db for raw datastream
                ##################################################
                if not DEVEL:
                    update_archive(raw_streams)

                # Get list of tar files from the archive
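                # (`site` below carries whatever value the loops above left in it)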
                for k, v in enumerate(raw_streams):
                    stream = dir_pattern(5).format(stage, job,
                                                   'file_comparison/tar', site,
                                                   v)
                    files = os.listdir(stream)
                    files = "','".join(files)
                    args = (v, files)
                    query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_active = true and file_name in ('%s')"
                    result = db.query(query % args, columns=cols)

                    if len(result) > 0:
                        archive_tars[v] = result
                    else:
                        print("\nNo results for %s" % v)

                # Unset loop variables
                if len(raw_streams) > 0:
                    del k, v, args, result

                print("Done generating tar file list")

                # Find data on tar files in list and add it to 'contents'
                print("")
                print("Adding tar files to deletion list...", end="")

                for f in self.archive['remove']['tar']:
                    files = archive_tars[f['instrument']]
                    for k, v in enumerate(files):
                        if v['file_name'] == f['file_name']:
                            index = k
                            break
                    else:
                        print("\nUnable to find %s in archive db" %
                              f['file_name'])
                        self.config['exit'] = True
                        return self.config, self.files

                    temp = f['file_name']
                    if not any(d['filename'] == temp for d in contents):
                        contents.append({
                            'datastream': f['instrument'],
                            'filename': f['file_name'],
                            'hash': files[index]['file_md5'],
                            'version': files[index]['file_version']
                        })

                if len(self.archive['remove']['tar']) > 0:
                    del f, files, k, v, index

                print("Done")

        # Set proper file names in deletion list
        print("Setting proper file names in deletion list...", end="")
        for k, v in archive_files.items():
            if k.split('.')[-1] != '00':
                for f in v:
                    if f['file_name'] not in p_files[k]:
                        temp = f['file_name']
                        if not any(d['filename'] == temp for d in contents):
                            contents.append({
                                'datastream': k,
                                'filename': f['file_name'],
                                'hash': f['file_md5'],
                                'version': f['file_version']
                            })

        print("Done")

        # Store the list of files that need to be archived to file
        archive_json_file = dir_pattern(3).format(stage, job, 'archive.json')
        with open(archive_json_file, 'w') as fp:
            fp.write(
                json.dumps(self.archive['add']['tar'],
                           indent=2,
                           sort_keys=False,
                           separators=(',', ': ')))

        # Update the saved status
        self.config['cleanup_status']['remove']['archive_list'] = True

        ##################################################
        # Write the results to file
        # (Use '\r\n' for Windows line endings)
        ##################################################
        print("\nEmailing deletion list...", end="")
        sys.stdout.flush()
        file_contents = []

        contents = sorted(contents, key=self.get_sort_key)

        for line in contents:
            file_contents.append("%s.v%s %s" % (line['filename'],
                                                line['version'],
                                                line['hash']))

        with open(dir_pattern().format(job_folder, del_file), 'w') as fp:
            fp.write("\r\n".join(file_contents))

        # Update the saved status
        self.config['cleanup_status']['remove']['deletion_list'] = True

        # Send the deletion list to the appropriate place (currently email, may be upload at a later time)
        self.email_del_list("%s.deletion-list.txt" % self.config['job'])
        # self.upload_del_list()

        print("Done")

        # Update the saved status
        self.config['cleanup_status']['remove']['status'] = True

        duration = datetime.now() - self.start_time
        print(duration)

        return self.config, self.files
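
The deletion list above is ordered with self.get_sort_key before being written out. Its body is not shown in these examples; a plausible sketch, assuming entries sort by datastream and then by file name:

def get_sort_key(self, entry):
    """Hypothetical sort key: group deletion-list entries by datastream,
    then order them by file name within each datastream."""
    return (entry['datastream'], entry['filename'])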
Example #5
    def run(self):
        config = self.config
        manager = self.manager
        duplicates = {}  # collision report accumulated across all datastreams

        if config['ingest']:
            # If staging for Ingest

            # Make sure collection does not have any files that might get overwritten
            empty = self.check_collection_empty()
            if not empty:
                print(
                    "\nFiles currently exist in your collection directory.\nPlease empty {}/{}/collection and try again.\n"
                    .format(config['stage'], config['job']))
                config['exit'] = True
                return config, self.files

            # cd to the stage directory
            os.chdir(config['stage'])

            # Check to see if a plugin needs to modify the datastream
            temp = manager.callPluginCommand('hook_datastream_alter',
                                             {'config': config})
            config = temp if temp is not None else config

            # Check to see if a plugin needs to modify the SIF data
            temp = manager.callPluginCommand('hook_sif_alter',
                                             {'config': config})
            config = temp if temp is not None else config

            # Establish a database connection
            db = DB(config)

            # Get the data_paths
            data_paths = db.get_data_paths()

            # Check to see if a plugin needs to modify the data_paths
            temp = manager.callPluginCommand('hook_data_paths_alter', {
                'config': config,
                'data_paths': data_paths
            })
            data_paths = temp if temp is not None else data_paths

            # for each instrument
            for k, v in enumerate(data_paths):
                archive_path = v['output']
                stage_path = v['input']

                # Set tar_path and check for plugin modifications
                tar_path = '{}/{}'.format(config['source'], archive_path)
                temp = manager.callPluginCommand('hook_tar_path_alter', {
                    'config': config,
                    'tar_path': tar_path
                })
                tar_path = temp if temp is not None else tar_path

                if os.path.exists(tar_path):
                    # Get a list of tar files that match specified dates
                    tar = UnPack(config, archive_path, stage_path)
                    tar_files = tar.get_tar_files()

                    temp = manager.callPluginCommand('hook_tar_files_alter',
                                                     {'config': config})
                    tar_files = temp if temp is not None else tar_files

                    if tar_files:
                        # compare_path = '{}/{}/.compare/{}'.format(config['stage'], config['job'], stage_path)
                        compare_path = dir_pattern(5).format(
                            config['stage'], config['job'], 'file_comparison',
                            'raw', stage_path)
                        tar_backup = dir_pattern(5).format(
                            config['stage'], config['job'], 'file_comparison',
                            'tar', stage_path)
                        collection_path = '{}/{}/collection/{}'.format(
                            config['stage'], config['job'], stage_path)

                        # Make the above paths if they don't already exist
                        if not os.path.exists(compare_path):
                            os.makedirs(compare_path)

                        if not os.path.exists(tar_backup):
                            os.makedirs(tar_backup)

                        if not os.path.exists(collection_path):
                            os.makedirs(collection_path)

                        # Copy the tar files to the backup location
                        if not tar.copy_files(tar_files, tar_backup):
                            print("Unable to copy tar files")

                        # Unpack the tar files
                        tar.extract_tar_files(tar_files)
                        has_dups = tar.handle_duplicate_files()
                        if has_dups:
                            config['duplicates'] = True

                            for i in has_dups:
                                duplicates[i] = has_dups[i]

                    else:
                        temp = tar_path.split('/')
                        if not config['quiet']:
                            print(
                                '\nData not available for {} using the dates specified'
                                .format(temp[-1]))

                else:
                    temp = tar_path.split('/')
                    if not config['quiet']:
                        print('\nData for {} does not exist.'.format(temp[-1]))

                site, process = stage_path.split('/')

                if self.files is None:
                    self.files = {}

                if site not in self.files:
                    self.files[site] = {}

                site = self.files[site]
                if process not in site:
                    site[process] = {}

                process = site[process]

                if os.path.exists(
                        dir_pattern(4).format(self.config['stage'],
                                              self.config['job'], 'collection',
                                              stage_path)):
                    files = os.listdir(
                        dir_pattern(4).format(self.config['stage'],
                                              self.config['job'], 'collection',
                                              stage_path))
                    dup_uuid = {}
                    for i in files:
                        original_name = i
                        temp = i.split('.')
                        if temp[-1].startswith('v'):
                            try:
                                int(temp[-1][1:])
                                original_name = '.'.join(temp[:-1])
                            except ValueError:
                                pass

                        process[i] = {
                            "uuid": str(uuid.uuid4()),
                            "current_name": i,
                            "original_name": original_name,
                            "stripped_name": None,
                            "processed_name": None,
                            "unpacked_name": i,
                            "duplicate_files": [],
                            "deleted": False,
                        }
                        if original_name != i:
                            dup_uuid[i] = process[i]['uuid']

                    for i in duplicates:
                        if i.startswith(data_paths[k]['input']):
                            for j in duplicates[i]:
                                # j and l are 'site/process/filename' strings
                                dup_site, dup_proc, name = j.split('/')
                                for l in duplicates[i]:
                                    temp = l.split('/')
                                    if j != l:
                                        self.files[dup_site][dup_proc][name][
                                            'duplicate_files'].append(
                                                dup_uuid[temp[2]])

                    # Copy the config files from /data/conf to /<stage>/<job>/conf
                    conf_path = "/data/conf/{0}/{0}{1}{2}".format(
                        self.config['site'], self.config['instrument'],
                        self.config['facility'])
                    conf_dest = "{0}/{1}/conf/{2}".format(
                        self.config['stage'], self.config['job'],
                        self.config['site'])
                    dest_folder = "{}{}{}".format(self.config['site'],
                                                  self.config['instrument'],
                                                  self.config['facility'])
                    if not os.path.exists(conf_path):
                        conf_path = "/data/conf/{0}/{1}{2}".format(
                            self.config['site'], self.config['instrument'],
                            self.config['facility'])
                        conf_dest = "{0}/{1}/conf/{2}".format(
                            self.config['stage'], self.config['job'],
                            self.config['site'])
                        dest_folder = "{}{}".format(self.config['instrument'],
                                                    self.config['facility'])

                    if os.path.exists(conf_path):
                        if not os.path.exists(conf_dest):
                            os.makedirs(conf_dest)

                        if os.path.exists(dir_pattern().format(
                                conf_dest, dest_folder)):
                            try:
                                os.rmdir(dir_pattern().format(
                                    conf_dest, dest_folder))
                            except OSError as e:
                                if e.errno == errno.ENOTEMPTY:
                                    exit(
                                        "Unable to copy config files to {}. Destination is not empty."
                                        .format(dir_pattern().format(
                                            conf_dest, dest_folder)))
                                else:
                                    raise

                        shutil.copytree(
                            conf_path,
                            dir_pattern().format(conf_dest, dest_folder))

            f = Files(self.config)
            src = dir_pattern(3).format(config['stage'], config['job'],
                                        'collection')
            # dst = dir_pattern(3).format(config['stage'], config['job'], '.compare')
            dst = dir_pattern(4).format(config['stage'], config['job'],
                                        'file_comparison', 'raw')
            if os.path.exists(dst):
                f.empty_dir(dst)
                os.rmdir(dst)

            shutil.copytree(src, dst)

            if len(duplicates) > 0:
                print('')
                print(
                    'The following files had naming collisions when unpacked.\nPlease verify the contents and keep only the appropriate file(s).'
                )
                print(
                    'Please do not rename files, simply delete any unwanted files.'
                )
                for i in duplicates:
                    print('')
                    for j in duplicates[i]:
                        print(j)
                print('')

            f.save_env()

        elif config['vap']:
            f = Files(self.config)
            f.save_env()

            vap = VapMgr(self.config)
            vap.add_to_env()

        return config, self.files
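
The collection scan above recognizes collision-renamed files by their '.vN' suffix, which is the naming scheme applied in Example #7. The same check, pulled out into a standalone helper for clarity:

def split_version_suffix(name):
    """Return (original_name, version) for names like 'x.dat.v2'.

    Mirrors the inline check in Example #5: version is None when the
    name carries no numeric '.vN' suffix.
    """
    parts = name.split('.')
    if len(parts) > 1 and parts[-1].startswith('v'):
        try:
            return '.'.join(parts[:-1]), int(parts[-1][1:])
        except ValueError:
            pass
    return name, None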
Example #6
    def run(self):
        """ Unpack the tar file """
        # Setup Vars
        st_files = self.tar.st_files
        file_names = self.tar.file_names
        # files = self.tar.members
        # temp = self.config['']

        files = [[] for _ in range(len(st_files))]

        # Open the tar file
        tar = tarfile.open(
            dir_pattern().format(self.tar.archive_path, self.file), 'r')

        # Get the content of the tar file and check for duplicate file names
        members = tar.getmembers()

        f = Files(self.config)

        # Iterate over each tar member
        for m in members:

            # Make sure arrays are not 0 length
            if len(file_names) == 0:
                file_names.append([])
            if len(files) == 0:
                files.append([])
            if len(st_files) == 0:
                st_files.append([])

            # Iterate over each entry in file_names
            # Add the file name to the correct array
            for k, v in enumerate(file_names):
                sf_names = st_files[k]
                sn = f.strip_name(m.name)
                if sn is None or sn in ('orig', 'bad'):
                    sn = m.name

                if not (m.name in v or sn in sf_names):
                    file_names[k].append(m.name)
                    files[k].append(m)
                    st_files[k].append(sn)
                    break
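            # for/else: the else below runs only when the loop finishes
            # without hitting `break`, i.e. no existing bucket could accept
            # this member, so a new duplicate bucket is started for it.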

            else:
                file_names.append([m.name])
                files.append([m])
                st_files.append([sn])

        duplicates = {}
        stripped = st_files[0]
        full_names = file_names[0]

        for i in range(1, len(file_names)):
            for k, v in enumerate(file_names[i]):
                try:
                    myIndex = stripped.index(st_files[i][k])
                except (IndexError, ValueError):
                    print("\nOOPS\n")
                    print("\nI: {}\nK: {}".format(i, k))
                    continue
                try:
                    key = full_names[myIndex]
                except IndexError:
                    print("\nOOPS 2\n")
                    continue

                if key not in duplicates:
                    duplicates[key] = []
                duplicates[key].append(v)

        # Extract all files
        for i in range(len(files)):
            # The first set extracts in place; later sets go to dup_N dirs
            path = 'dup_{}'.format(i) if i > 0 else ''
            tar.extractall(path=path, members=files[i])

        tar.close()

        self.tar.duplicates = duplicates

        return
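
The bucketing above spreads colliding member names into parallel lists so each extraction pass writes into its own dup_N directory. The same first-fit strategy, illustrated on plain strings (hypothetical input, for demonstration only):

def bucket_names(names):
    """First-fit bucketing: each bucket holds at most one copy of a name,
    mirroring the file_names/files/st_files bookkeeping in Example #6."""
    buckets = []
    for name in names:
        for bucket in buckets:
            if name not in bucket:
                bucket.append(name)
                break
        else:
            buckets.append([name])
    return buckets

# ['a.dat', 'b.dat', 'a.dat'] -> [['a.dat', 'b.dat'], ['a.dat']]
print(bucket_names(['a.dat', 'b.dat', 'a.dat']))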
Example #7
    def handle_duplicate_files(self):
        # Handle duplicates
        f = Files(self.config)
        dup_list = {}
        duplicates = {}

        files = self.file_names
        dups = self.duplicates

        if len(dups) > 0:
            for i, n in dups.items():
                for j, v in enumerate(n):
                    folder = 'dup_{}'.format(j + 1)
                    # Identical content: delete the duplicate copy.
                    # Different content: keep it under a versioned name.
                    delete = f.is_same_file(
                        dir_pattern().format(self.stage_path, i),
                        dir_pattern(3).format(self.stage_path, folder, v))
                    move = not delete

                    if delete:
                        os.remove(
                            dir_pattern(3).format(self.stage_path, folder, v))
                    elif move:
                        if i not in dup_list:
                            name = '{}.v1'.format(i)
                            dup_list[i] = [name]
                            src = dir_pattern().format(self.stage_path, i)
                            dst = dir_pattern().format(self.stage_path, name)
                            try:
                                os.rename(src, dst)
                            except OSError:
                                shutil.move(src, dst)

                        num = len(dup_list[i]) + 1
                        name = '{}.v{}'.format(v, num)
                        dup_list[i].append(name)
                        src = dir_pattern(3).format(self.stage_path, folder, v)
                        dst = dir_pattern().format(self.stage_path, name)
                        try:
                            os.rename(src, dst)
                        except OSError:
                            shutil.move(src, dst)

            for i in dup_list:
                if len(dup_list[i]) > 1:
                    key = dir_pattern().format(self.local, i)
                    duplicates[key] = []
                    for j in dup_list[i]:
                        duplicates[key].append(dir_pattern().format(
                            self.local, j))

            self.dups = duplicates

            # Delete directory if now empty
            dupdirs = glob('{}/dup_*'.format(self.stage_path))
            for i in dupdirs:
                f.empty_dir(i)
                os.rmdir(i)

        return False if duplicates == {} else duplicates
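
Callers branch on the truthiness of the return value: False means every collision was a byte-identical copy and was simply deleted, while a dict maps each colliding file's original path to the full list of renamed '.vN' copies kept on disk. This is the usage pattern from Example #5:

# From Example #5: fold surviving collisions into the run-wide report.
has_dups = tar.handle_duplicate_files()
if has_dups:
    config['duplicates'] = True
    for path, variants in has_dups.items():
        duplicates[path] = variants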