def run(self):
    """ Strip the ARM prefix from all unpacked files """
    manager = PluginManager()
    config = self.config
    f = Files(config)
    cwd = os.getcwd()
    stage = config['stage']
    collection = dir_pattern(3).format(stage, config['job'], 'collection')

    # Make sure files are supposed to be renamed
    if not config['rename']:
        return config, self.files

    # Verify there are no file collisions
    if self.check_for_collisions():
        return config, self.files

    # Strip the ARM prefix from all of the files
    print("\nStripping ARM prefix from files... ", end="")
    sys.stdout.flush()

    manager.callPluginCommand('hook_rename_preprocess', {'config': config})

    os.chdir(collection)
    sites = set(os.listdir('.'))
    for site in sites:
        os.chdir(site)
        instruments = set(os.listdir('.'))
        for ins in instruments:
            os.chdir(ins)
            files = set(os.listdir('.'))
            for i in files:
                new_name = f.rename_file(i)
                if new_name is not None and i != new_name:
                    # Track the file under its new name
                    self.files[site][ins][new_name] = self.files[site][ins][i]
                    self.files[site][ins].pop(i)
                    self.files[site][ins][new_name]['current_name'] = new_name
                    self.files[site][ins][new_name]['stripped_name'] = new_name
            os.chdir('..')
        os.chdir('..')

    manager.callPluginCommand('hook_renamed_files_alter', {'config': config})

    # Restore the working directory
    os.chdir(cwd)

    print("Done\n")
    sys.stdout.flush()

    return config, self.files
def check_for_collisions(self):
    """ Check all unpacked files for file naming collisions """
    print("Checking for file naming collisions...", end="")
    sys.stdout.flush()

    config = self.config
    f = Files(config, self.files)
    cwd = os.getcwd()
    collection = dir_pattern(3).format(config['stage'], config['job'], 'collection')

    os.chdir(collection)
    sites = os.listdir('.')
    for site in sites:
        os.chdir(site)
        instruments = set(os.listdir('.'))
        for ins in instruments:
            os.chdir(ins)
            files = set(os.listdir('.'))
            names = self.files[site][ins]

            # Mark files as deleted
            for k, v in names.items():
                if k not in files:
                    names[k]['deleted'] = True

            # Check for duplicates
            for k, v in names.items():
                if len(v['duplicate_files']) > 0 and not v['deleted']:
                    for i in v['duplicate_files']:
                        name = f.get_file_by_uuid(i)
                        if names[name]['uuid'] == i and not names[name]['deleted']:
                            config['duplicates'] = True
                            os.chdir(cwd)
                            print("Fail")
                            print("Files with naming collisions still exist.\n"
                                  "Please resolve these issues before continuing.\n")
                            return True
            os.chdir('..')
        os.chdir('..')

    os.chdir(cwd)
    config['duplicates'] = False
    print("Done")
    sys.stdout.flush()
    return False
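# The methods above and below lean heavily on dir_pattern() to build paths. A minimal
# sketch of what it appears to do, assuming it simply returns a '/'-joined format
# string with the requested number of placeholders (an illustration only, not the
# project's actual helper):
#
#     def dir_pattern(levels=2):
#         return '/'.join(['{}'] * levels)
#
#     dir_pattern(3).format(stage, job, 'collection')   # -> '<stage>/<job>/collection'
#     dir_pattern().format(datastream_path, site)       # -> '<datastream_path>/<site>'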
def run(self):
    """ Run the cleanup portion of the cleanup phase """
    if not self.config['cleanup_status']['archive']['status']:
        print("Data files must be archived before they can be cleaned up.")
        self.config['exit'] = True
        return self.config, self.files

    stage = self.config['stage']
    job = self.config['job']

    ################################################################################
    # Update local archive database
    ################################################################################
    if not self.config['cleanup_status']['cleanup']['files_archived']:
        print("Updating local copy of the archive...", end="")

        # Setup the datastreams to update
        datastreams = []
        datastream_path = dir_pattern(3).format(stage, job, 'datastream')
        for site in os.listdir(datastream_path):
            path = dir_pattern().format(datastream_path, site)
            for folder in os.listdir(path):
                abs_folder = dir_pattern().format(path, folder)
                if os.path.isdir(abs_folder) and not os.path.islink(abs_folder):
                    datastreams.append(folder)

        # Update the local copy of the archive db
        if not DEVEL:
            update_archive(datastreams)

        print("Done")

        ################################################################################
        # Verify that all files to be added to the archive were added
        ################################################################################
        print("Verifying processed and bundled files have been archived...", end="")
        cwd = os.getcwd()
        archive_files = {}
        db_file = '/apps/ds/conf/datainv/.db_connect'
        alias = 'inv_read'
        if not os.path.exists(db_file):
            print("Failed")
            print("Unable to connect to the archive database. Please try again later.")
            self.config['exit'] = True
            return self.config, self.files

        db = DB(self.config, db_file=db_file, alias=alias)

        # Store the query
        query = ("SELECT * FROM get_remote_files_by_tag('%s') "
                 "WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true "
                 "ORDER BY file_stamp, file_version;")

        # List the column names so the values can be mapped in a dictionary
        cols = [
            'file_tag', 'file_name', 'file_version', 'file_size', 'file_stored',
            'file_md5', 'file_stamp', 'file_checked', 'file_active'
        ]

        # Convert the start and end dates to a unix timestamp
        start = convert_date_to_timestamp(self.config['begin'])
        end = convert_date_to_timestamp(self.config['end'])

        archive_file = dir_pattern(3).format(stage, job, 'current_archive.json')
        with open(archive_file, 'r') as fp:
            oArch = json.loads(fp.read())

        os.chdir(datastream_path)
        for site in os.listdir('.'):
            path = dir_pattern().format(datastream_path, site)
            os.chdir(site)
            for folder in os.listdir('.'):
                os.chdir(folder)
                args = (folder, start, end)
                result = db.query(query % args, columns=cols)
                for f in os.listdir('.'):
                    if not os.path.isdir(dir_pattern().format(os.getcwd(), f)):
                        try:
                            new_version = next(d['file_version'] for d in result
                                               if d['file_name'] == f)
                            old_version = next(o['file_version'] for o in oArch[folder]
                                               if o['file_name'] == f)
                            if not new_version > old_version:
                                print("Failed")
                                print("Not all files have been successfully archived. "
                                      "Please try again later.")
                                self.config['exit'] = True
                                return self.config, self.files
                        except StopIteration:
                            pass
                os.chdir('..')
            os.chdir('..')

        os.chdir(cwd)
        self.config['cleanup_status']['cleanup']['files_archived'] = True
        print("Done")

    ################################################################################
    # Remove all files from <job>/datastream
    ################################################################################
    if not self.config['cleanup_status']['cleanup']['files_cleaned_up']:
        print("Cleaning up project files...", end="")

        # Remove archive.json
        # Remove current_archive.json
        # Remove <job>.deletion-list.txt
        f = Files(self.config)
        path = dir_pattern().format(stage, job)
        delete = [
            "datastream",
            "collection",
            "file_comparison/raw",
            "file_comparison/tar",
            'archive.json',
            'current_archive.json',
            '%s.deletion-list.txt' % job,
        ]
        try:
            for i in delete:
                item = dir_pattern().format(path, i)
                if os.path.exists(item):
                    if os.path.isdir(item):
                        f.empty_dir(item)
                    elif os.path.isfile(item):
                        os.remove(item)
        except Exception:
            print("Failed")
            print("Unable to cleanup all files. Please try again, or cleanup project manually.")
            self.config['exit'] = True
            return self.config, self.files

        print("Done")
        self.config['cleanup_status']['cleanup']['files_cleaned_up'] = True

    self.config['cleanup_status']['cleanup']['status'] = True
    return self.config, self.files
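# The db.query(..., columns=cols) calls above are assumed to return one dict per row,
# keyed by the names in 'cols' (that is how the results are consumed, e.g.
# d['file_version'] and d['file_name']). An illustrative row, not real data:
#
#     [{'file_tag': 'sgpmfrsrC1.00', 'file_name': 'sgpmfrsrC1.00.20140101.000000.raw.tar',
#       'file_version': 2, 'file_md5': '...', 'file_stamp': 1388534400, 'file_active': True, ...}]
#
# The verification step then compares each file's 'file_version' against the version
# recorded in current_archive.json to confirm a newer copy reached the archive.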
def run(self):
    """ Run the remove portion of the cleanup phase """
    self.start_time = datetime.now()

    if not self.config['cleanup_status']['review']['status']:
        print("\nData must be reviewed before it can be removed from the archive.")
        self.config['exit'] = True
        return self.config, self.files

    stage = self.config['stage']
    job = self.config['job']
    del_file = '%s.deletion-list.txt' % job
    job_folder = dir_pattern().format(stage, job)
    exists = False
    replace = False

    # Check to see if the deletion file exists
    if os.path.exists(dir_pattern().format(job_folder, del_file)):
        exists = True
        ui = UI()
        replace = ui.yn_choice(
            '%s already exists.\n Would you like to overwrite this file?' % del_file, 'n')

    if exists and not replace:
        return self.config, self.files

    # Either the file doesn't exist or the user has chosen to overwrite it.
    # Create <job>.deletion-list.txt and reset the statuses for this run.
    for k in self.config['cleanup_status']['remove']:
        self.config['cleanup_status']['remove'][k] = False

    contents = []

    ##################################################
    # Get list of files from datastream folder
    ##################################################
    datastreams = []
    datastream_path = dir_pattern(3).format(stage, job, 'datastream')
    for site in os.listdir(datastream_path):
        path = dir_pattern().format(datastream_path, site)
        for folder in os.listdir(path):
            abs_folder = dir_pattern().format(path, folder)
            if os.path.isdir(abs_folder) and not os.path.islink(abs_folder):
                datastreams.append(folder)

    # Processed files
    p_files = {}
    for k, v in enumerate(datastreams):
        if v not in p_files:
            p_files[v] = []
        p_files[v] = os.listdir(dir_pattern(3).format(datastream_path, site, v))

    ##################################################
    # Update the local copy of the archive db
    ##################################################
    # print("\nUpdating list of files stored at the archive...", end="")
    # if not DEVEL:
    #     update_archive(datastreams)
    # print("Done")

    ##################################################
    # Get list of files from archive db
    ##################################################
    print("\nRetrieving list of relevant files stored at the archive...", end="")

    # Connect to the database
    archive_files = {}
    db_file = '/apps/ds/conf/datainv/.db_connect'
    alias = 'inv_read'
    if not os.path.exists(db_file):
        print("\nUnable to connect to the archive database. Please try again later.")
        self.config['exit'] = True
        return self.config, self.files

    db = DB(self.config, db_file=db_file, alias=alias)

    # Store the query
    query = ("SELECT * FROM get_remote_files_by_tag('%s') "
             "WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true "
             "ORDER BY file_stamp, file_version;")

    # List the column names so the values can be mapped in a dictionary
    cols = [
        'file_tag', 'file_name', 'file_version', 'file_size', 'file_stored',
        'file_md5', 'file_stamp', 'file_checked', 'file_active'
    ]

    # Convert the start and end dates to a unix timestamp
    start = convert_date_to_timestamp(self.config['begin'])
    end = convert_date_to_timestamp(self.config['end'])

    # Query the database for each of the datastreams
    for k, v in enumerate(datastreams):
        args = (v, start, end)
        result = db.query(query % args, columns=cols)
        if len(result) > 0:
            archive_files[v] = result
        else:
            print("\nNo results for %s" % v)

    # Unset loop variables
    if len(datastreams) > 0:
        del k, v, args, result

    print("Done")

    print("Map original tar bundle structure...", end="")
    self.maps['orig']['tar'] = self.get_tar_structure(
        dir_pattern(3).format(stage, job, "file_comparison/tar"))
    print("Done")

    if self.config['ingest']:
        # Add files to the list that should be removed from the archive
        print("\nGenerating list of files to remove from the archive...")
        sys.stdout.flush()

        ##################################################
        # Compare raw files to see if they changed
        ##################################################
        # Setup variables for the following code to use
        # List of files as they have traveled from tar file through the ingest,
        # mapped by their current name
        file_history = self.files
        # The datastreams that contain the raw files (ex. sgpmfrsrC1.00)
        raw_streams = []

        # Setup the paths for the ingested and untarred raw files
        new_folder = dir_pattern(3).format(stage, job, 'datastream')
        old_folder = dir_pattern(3).format(stage, job, 'file_comparison/raw')

        # Container to hold a mapping of raw files in the <job>/datastream folder
        raw_files = {}
        # Container to hold a list of tar files at the archive
        archive_tars = {}
        # Does the raw data in "datastream" need to be bundled
        bundle_data = False

        # Get a list of the sites in "datastream"
        for site in os.listdir(new_folder):
            raw_files[site] = {}

        # Establish a structure for the raw files in "datastream"
        # This structure follows the same pattern as 'file_history'
        for site in raw_files:
            for instrument in glob(dir_pattern(3).format(new_folder, site, '*.00')):
                instrument = instrument.split('/')[-1]
                raw_files[site][instrument] = {}
                raw_streams.append(instrument)
                for f in os.listdir(dir_pattern(3).format(new_folder, site, instrument)):
                    raw_files[site][instrument][f] = {}

        # Compare all of the existing files.
        # By comparing existing files instead of files that were unpacked we make sure
        # to include all files and can check for files that are not being tracked
        # (this should never happen).
        c = Files(self.config)
        for i, s in raw_files.items():  # i = site, s = site dict
            for j, p in s.items():  # j = process/instrument, p = file dict
                pbar = UI()
                percent = 0
                pbar.progress(percent)
                count = len(p)
                l = 1
                for k, f in p.items():  # k = file name, f = file info
                    # Compare the file in 'datastream' with its counterpart
                    # in 'file_comparison/raw'
                    if k not in file_history[i][j]:
                        # This branch should never be reached: the file is not being
                        # tracked, so the raw files in datastream need to be rebundled
                        # and the tar file containing this raw file must be added
                        # to the archive.
                        bundle_data = True

                        # Make sure the site is in the dict
                        if i not in self.archive['add']['raw']:
                            self.archive['add']['raw'][i] = {j: {}}
                        # Make sure the process is in the dict
                        if j not in self.archive['add']['raw'][i]:
                            self.archive['add']['raw'][i][j] = {}
                        # Add the file to the dict
                        self.archive['add']['raw'][i][j][k] = {}

                        # Go to the next iteration of the loop
                        # (file cannot be compared because there is no counterpart)
                        continue

                    # Compare the ingested raw file with the unpacked raw file
                    file_path = dir_pattern(5).format(stage, job, '%s', i, j)
                    file_1 = dir_pattern().format(file_path % 'datastream', k)
                    file_2 = dir_pattern().format(
                        file_path % 'file_comparison/raw',
                        file_history[i][j][k]['original_name'])

                    if not c.is_same_file(file_1, file_2):
                        # The files are not the same.
                        # Raw files in datastream need to be rebundled.
                        bundle_data = True

                        # Ensure self.archive['remove']['raw'] has the proper structure
                        if i not in self.archive['remove']['raw']:
                            self.archive['remove']['raw'][i] = {j: []}
                        if j not in self.archive['remove']['raw'][i]:
                            self.archive['remove']['raw'][i][j] = []

                        self.archive['remove']['raw'][i][j].append(k)
                        # Make self.archive['remove']['raw'][i][j] a unique list
                        self.archive['remove']['raw'][i][j] = list(
                            set(self.archive['remove']['raw'][i][j]))

                    percent = int((float(l) / float(count)) * 100)
                    pbar.progress(percent)
                    l = l + 1

                percent = int((float(l) / float(count)) * 100)
                pbar.progress(percent)
                print("")
                sys.stdout.flush()

        # Unset loop variables
        if len(raw_files) > 0:
            del i, j, k, s, p, f, c

        if bundle_data:
            # Fill self.maps['orig']['history'] and bundle the data
            for site in file_history:
                if site not in self.maps['orig']['history']:
                    self.maps['orig']['history'][site] = {}
                for process in file_history[site]:
                    if process not in self.maps['orig']['history'][site]:
                        self.maps['orig']['history'][site][process] = {}
                    for f, d in file_history[site][process].items():
                        if d['original_name'] not in self.maps['orig']['history'][site][process]:
                            self.maps['orig']['history'][site][process][d['original_name']] = d

            # Find any orig/bad files and copy them over (correcting names as necessary)
            other_files_path = dir_pattern(3).format(
                stage, job, 'file_comparison/raw/%s/%s/%s')
            for i, s in self.maps['orig']['history'].items():
                for j, p in s.items():
                    bad_files = glob(other_files_path % (i, j, '*.bad.*'))
                    orig_files = glob(other_files_path % (i, j, '*.orig.*'))
                    edit_files = glob(other_files_path % (i, j, '*.edit*.*'))

                    # if len(orig_files) > 0:
                    #     pbar = UI()
                    #     count = len(orig_files)
                    #     pbar.progress(0)
                    for k, of in enumerate(orig_files):
                        oFile = of.split('/')[-1]
                        if oFile in p:
                            key = oFile.replace('orig', 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace('.raw.', '.orig.')
                                filename = dir_pattern(6).format(
                                    stage, job, 'datastream', i, j, filename)
                                shutil.copy(of, filename)
                    if len(orig_files) > 0:
                        del k, of, oFile, key
                    # print("")
                    # sys.stdout.flush()

                    # if len(bad_files) > 0:
                    #     pbar = UI()
                    #     count = len(bad_files)
                    #     pbar.progress(0)
                    for k, bf in enumerate(bad_files):
                        bFile = bf.split('/')[-1]
                        if bFile in p:
                            key = bFile.replace('bad', 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace('.raw.', '.bad.')
                            else:
                                filename = bFile
                            filename = dir_pattern(6).format(
                                stage, job, 'datastream', i, j, filename)
                            shutil.copy(bf, filename)
                            # # Update progress bar
                            # pbar.progress(int((float(k + 1) / float(count)) * 100))
                    if len(bad_files) > 0:
                        del k, bf, bFile, key
                    # print("")
                    # sys.stdout.flush()

                    # if len(edit_files) > 0:
                    #     pbar = UI()
                    #     count = len(edit_files)
                    #     pbar.progress(0)
                    for k, ef in enumerate(edit_files):
                        eFile = ef.split('/')[-1]
                        temp = eFile.split('.')
                        edit = None
                        for t in temp:
                            if t.startswith('edit'):
                                edit = t
                                break
                        if eFile in p:
                            key = eFile.replace(edit, 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace(
                                    '.raw.', ".%s." % edit)
                                filename = dir_pattern(6).format(
                                    stage, job, 'datastream', i, j, filename)
                                shutil.copy(ef, filename)
                                # # Update progress bar
                                # pbar.progress(int((float(k + 1) / float(count)) * 100))
                    if len(edit_files) > 0:
                        del k, ef, eFile, edit, t, key
                    # print("")
                    # sys.stdout.flush()

                del j, p
            del i, s

            # Create any needed orig files
            print("Create needed orig files...")
            sys.stdout.flush()
            for i, s in self.archive['remove']['raw'].items():
                for j, p in s.items():
                    path = dir_pattern(5).format(stage, job, "datastream", i, j)
                    k = 0
                    count = len(p)
                    for f in p:
                        orig = f.replace('.raw.', '.orig.')
                        if not os.path.exists(dir_pattern().format(path, orig)):
                            src = dir_pattern(6).format(
                                stage, job, "file_comparison/raw", i, j,
                                file_history[i][j][f]['unpacked_name'])
                            dst = dir_pattern().format(path, orig)
                            shutil.copy(src, dst)
                            # del src, dst
                        percent = int((float(k) / float(count)) * 100)
                        pbar.progress(percent)
                        k = k + 1
                    if percent < 100:
                        percent = int((float(k) / float(count)) * 100)
                        pbar.progress(percent)
                    print("")
            # Unset loop variables
            # del i, s, j, p, path, f, orig, src, dst
            print("Done")

            # Bundle the data
            self.bundle_raw_data(raw_streams)
            self.config['cleanup_status']['remove']['files_bundled'] = True

        print("Map new tar bundle structure...", end="")
        self.maps['new']['tar'] = self.get_tar_structure(
            dir_pattern(3).format(stage, job, "datastream"))
        print("Done")

        print("")
        print("Mapping raw structure from original tar files...", end="")
        self.maps['orig']['raw'] = self.map_raw_structure(self.maps['orig']['tar'])
        print("Done")

        print("Mapping raw structure from new tar files...", end="")
        self.maps['new']['raw'] = self.map_raw_structure(self.maps['new']['tar'])
        print("Done")

        ##################################################
        # Find all of the tar files that need
        # to be removed from the archive
        ##################################################
        print("")
        print("Generating list of tar files to be removed from the archive...")
        sys.stdout.flush()

        for i, s in self.archive['remove']['raw'].items():
            percent = 0
            for j, p in s.items():
                pbar = UI()
                count = len(p)
                pbar.progress(percent)
                k = 1
                for raw_file in p:
                    tar_files = self.find_original_tar_bundle(
                        file_history[i][j][raw_file]['original_name'], i, j)
                    for f in tar_files:
                        # Check by file name so the same tar is not queued twice
                        if not any(d['file_name'] == f
                                   for d in self.archive['remove']['tar']):
                            tar = {
                                'site': i,
                                'instrument': j,
                                'file_name': f
                            }
                            self.archive['remove']['tar'].append(tar)
                    percent = int((float(k) / float(count)) * 100)
                    pbar.progress(percent)
                    k = k + 1
                if percent == 99:
                    pbar.progress(100)
                print("")
                sys.stdout.flush()

        # Unset loop variables
        if len(self.archive['remove']['raw']) > 0:
            del i, s, j, p, raw_file, tar_files, f, tar
        print("Done")

        ##################################################
        # Find all of the tar files that need
        # to be added to the archive
        ##################################################
        print("")
        print("Generating list of tar files to be added to the archive...")
        pbar = UI()
        pbar.progress(0)
        count = len(self.archive['remove']['tar'])
        percent = 0
        i = 1

        for tar_file in self.archive['remove']['tar']:
            files = self.find_all_files_from_original_tar(
                tar_file['file_name'], tar_file['site'], tar_file['instrument'])
            for f in files:
                if not any(d['file_name'] == f for d in self.archive['add']['tar']):
                    tar = {
                        'site': tar_file['site'],
                        'instrument': tar_file['instrument'],
                        'file_name': f
                    }
                    self.archive['add']['tar'].append(tar)
            percent = int((float(i) / float(count)) * 100)
            pbar.progress(percent)
            i = i + 1
        if percent == 99:
            pbar.progress(100)
        print("")
        sys.stdout.flush()

        # Unset loop variables
        if len(self.archive['remove']['tar']) > 0:
            del tar_file, files, f

        for i, s in self.archive['add']['raw'].items():
            for j, p in s.items():
                pbar = UI()
                pbar.progress(0)
                percent = 0
                count = len(p)
                n = 1  # progress counter (kept separate so the site key 'i' is not clobbered)
                for raw_file, info in p.items():
                    tar_files = self.find_original_tar_bundle(raw_file, i, j)
                    for f in tar_files:
                        if not any(d['file_name'] == f
                                   for d in self.archive['add']['tar']):
                            tar = {
                                'site': i,
                                'instrument': j,
                                'file_name': f
                            }
                            self.archive['add']['tar'].append(tar)
                    percent = int((float(n) / float(count)) * 100)
                    pbar.progress(percent)
                    n = n + 1
                if percent == 99:
                    pbar.progress(100)
                print("")
                sys.stdout.flush()

        # Unset loop variables
        if len(self.archive['add']['raw']) > 0:
            del i, s, j, p, raw_file, info, tar_files
        if 'f' in locals():
            del f
        if 'tar' in locals():
            del tar

        ##################################################
        # Update archive db for raw datastream
        ##################################################
        if not DEVEL:
            update_archive(raw_streams)

        # Get list of tar files from the archive
        for k, v in enumerate(raw_streams):
            stream = dir_pattern(5).format(stage, job, 'file_comparison/tar', site, v)
            files = os.listdir(stream)
            files = "','".join(files)
            args = (v, files)
            query = ("SELECT * FROM get_remote_files_by_tag('%s') "
                     "WHERE file_active = true and file_name in ('%s')")
            result = db.query(query % args, columns=cols)
            if len(result) > 0:
                archive_tars[v] = result
            else:
                print("\nNo results for %s" % v)

        # Unset loop variables
        if len(raw_streams) > 0:
            del k, v, args, result
        print("Done generating tar file list")

        # Find data on tar files in list and add it to 'contents'
        print("")
        print("Adding tar files to deletion list...", end="")
        for f in self.archive['remove']['tar']:
            files = archive_tars[f['instrument']]
            for k, v in enumerate(files):
                if v['file_name'] == f['file_name']:
                    index = k
                    break
            else:
                print("\nUnable to find %s in archive db" % f['file_name'])
                self.config['exit'] = True
                return self.config, self.files

            temp = f['file_name']
            if not any(d['filename'] == temp for d in contents):
                contents.append({
                    'datastream': f['instrument'],
                    'filename': f['file_name'],
                    'hash': files[index]['file_md5'],
                    'version': files[index]['file_version']
                })
        if len(self.archive['remove']['tar']) > 0:
            del f, files, k, v, index
        print("Done")

    # Set proper file names in deletion list
    print("Setting proper file names in deletion list...", end="")
    for k, v in archive_files.items():
        if k.split('.')[-1] != '00':
            for key, f in enumerate(v):
                if f['file_name'] not in p_files[k]:
                    temp = f['file_name']
                    if not any(d['filename'] == temp for d in contents):
                        contents.append({
                            'datastream': k,
                            'filename': f['file_name'],
                            'hash': f['file_md5'],
                            'version': f['file_version']
                        })
    print("Done")

    # Store the list of files that need to be archived to file
    archive_json_file = dir_pattern(3).format(stage, job, 'archive.json')
    with open(archive_json_file, 'w') as fp:
        fp.write(json.dumps(self.archive['add']['tar'],
                            indent=2,
                            sort_keys=False,
                            separators=(',', ': ')))

    # Update the saved status
    self.config['cleanup_status']['remove']['archive_list'] = True

    ##################################################
    # Write the results to file
    # (Use '\r\n' for Windows line endings)
    ##################################################
    print("\nEmailing deletion list...", end="")
    sys.stdout.flush()

    file_contents = []
    contents = sorted(contents, key=self.get_sort_key)
    for line in contents:
        l = "%s.v%s %s" % (line['filename'], line['version'], line['hash'])
        file_contents.append(l)

    with open(dir_pattern().format(job_folder, del_file), 'w') as fp:
        fp.write("\r\n".join(file_contents))

    # Update the saved status
    self.config['cleanup_status']['remove']['deletion_list'] = True

    # Send the deletion list to the appropriate place
    # (currently email, may be upload at a later time)
    self.email_del_list("%s.deletion-list.txt" % self.config['job'])
    # self.upload_del_list()
    print("Done")

    # Update the saved status
    self.config['cleanup_status']['remove']['status'] = True

    duration = datetime.now() - self.start_time
    print(duration)

    return self.config, self.files
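# The deletion list written above is a plain text file with one entry per line in the
# form "<file_name>.v<version> <md5>", joined with '\r\n'. For example (illustrative
# values only, not taken from a real archive):
#
#     sgpmfrsrC1.00.20140101.000000.raw.20140101000000.tar.v1 d41d8cd98f00b204e9800998ecf8427e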
def run(self):
    """ Stage the files needed for this job """
    config = self.config
    manager = self.manager
    duplicates = {}  # Files that had naming collisions when unpacked

    if config['ingest']:
        # If staging for Ingest
        # Make sure collection does not have any files that might get overwritten
        empty = self.check_collection_empty()
        if not empty:
            print("\nFiles currently exist in your collection directory.\n"
                  "Please empty {}/{}/collection and try again.\n".format(
                      config['stage'], config['job']))
            config['exit'] = True
            return config, self.files

        # cd to the stage directory
        os.chdir(config['stage'])

        # Check to see if a plugin needs to modify the datastream
        temp = manager.callPluginCommand('hook_datastream_alter', {'config': config})
        config = temp if temp is not None else config

        # Check to see if a plugin needs to modify the SIF data
        temp = manager.callPluginCommand('hook_sif_alter', {'config': config})
        config = temp if temp is not None else config

        # Establish a database connection
        db = DB(config)

        # Get the data_paths
        data_paths = db.get_data_paths()

        # Check to see if a plugin needs to modify the data_paths
        temp = manager.callPluginCommand('hook_data_paths_alter', {
            'config': config,
            'data_paths': data_paths
        })
        data_paths = temp if temp is not None else data_paths

        # For each instrument
        for k, v in enumerate(data_paths):
            archive_path = v['output']
            stage_path = v['input']

            # Set tar_path and check for plugin modifications
            tar_path = '{}/{}'.format(config['source'], archive_path)
            temp = manager.callPluginCommand('hook_tar_path_alter', {
                'config': config,
                'tar_path': tar_path
            })
            tar_path = temp if temp is not None else tar_path

            if os.path.exists(tar_path):
                # Get a list of tar files that match specified dates
                tar = UnPack(config, archive_path, stage_path)
                tar_files = tar.get_tar_files()

                temp = manager.callPluginCommand('hook_tar_files_alter', {'config': config})
                tar_files = temp if temp is not None else tar_files

                if tar_files and len(tar_files) > 0:
                    # compare_path = '{}/{}/.compare/{}'.format(config['stage'], config['job'], stage_path)
                    compare_path = dir_pattern(5).format(
                        config['stage'], config['job'], 'file_comparison', 'raw', stage_path)
                    tar_backup = dir_pattern(5).format(
                        config['stage'], config['job'], 'file_comparison', 'tar', stage_path)
                    collection_path = '{}/{}/collection/{}'.format(
                        config['stage'], config['job'], stage_path)

                    # Make the above paths if they don't already exist
                    if not os.path.exists(compare_path):
                        os.makedirs(compare_path)
                    if not os.path.exists(tar_backup):
                        os.makedirs(tar_backup)
                    if not os.path.exists(collection_path):
                        os.makedirs(collection_path)

                    # Copy the tar files to the backup location
                    if not tar.copy_files(tar_files, tar_backup):
                        print("Unable to copy tar files")

                    # Unpack the tar files
                    tar.extract_tar_files(tar_files)

                    has_dups = tar.handle_duplicate_files()
                    if has_dups:
                        config['duplicates'] = True
                        for i in has_dups:
                            duplicates[i] = has_dups[i]
                else:
                    temp = tar_path.split('/')
                    if not config['quiet']:
                        print('\nData not available for {} using the dates specified'
                              .format(temp[-1]))
            else:
                temp = tar_path.split('/')
                if not config['quiet']:
                    print('\nData for {} does not exist.'.format(temp[-1]))

            site, process = stage_path.split('/')

            if self.files is None:
                self.files = {}
            if site not in self.files:
                self.files[site] = {}
            site = self.files[site]
            if process not in site:
                site[process] = {}
            process = site[process]

            if os.path.exists(dir_pattern(4).format(
                    self.config['stage'], self.config['job'], 'collection', stage_path)):
                files = os.listdir(dir_pattern(4).format(
                    self.config['stage'], self.config['job'], 'collection', stage_path))
                dup_uuid = {}
                for i in files:
                    original_name = i
                    temp = i.split('.')
                    # Files renamed to resolve collisions end in a version suffix
                    # like ".v1"; strip it to recover the original name
                    if temp[-1][0] == 'v':
                        try:
                            int(temp[-1][1:])
                            original_name = '.'.join(temp[:-1])
                        except ValueError:
                            pass
                    process[i] = {
                        "uuid": str(uuid.uuid4()),
                        "current_name": i,
                        "original_name": original_name,
                        "stripped_name": None,
                        "processed_name": None,
                        "unpacked_name": i,
                        "duplicate_files": [],
                        "deleted": False,
                    }
                    if original_name != i:
                        dup_uuid[i] = process[i]['uuid']

                for i in duplicates:
                    if i.startswith(data_paths[k]['input']):
                        for j in duplicates[i]:
                            site, process, name = j.split('/')
                            for l in duplicates[i]:
                                temp = l.split('/')
                                if j != l:
                                    self.files[site][process][name][
                                        'duplicate_files'].append(dup_uuid[temp[2]])

        # Copy the config files from /data/conf to /<stage>/<job>/conf
        conf_path = "/data/conf/{0}/{0}{1}{2}".format(
            self.config['site'], self.config['instrument'], self.config['facility'])
        conf_dest = "{0}/{1}/conf/{2}".format(
            self.config['stage'], self.config['job'], self.config['site'])
        dest_folder = "{}{}{}".format(
            self.config['site'], self.config['instrument'], self.config['facility'])

        if not os.path.exists(conf_path):
            conf_path = "/data/conf/{0}/{1}{2}".format(
                self.config['site'], self.config['instrument'], self.config['facility'])
            conf_dest = "{0}/{1}/conf/{2}".format(
                self.config['stage'], self.config['job'], self.config['site'])
            dest_folder = "{}{}".format(self.config['instrument'], self.config['facility'])

        if os.path.exists(conf_path):
            if not os.path.exists(conf_dest):
                os.makedirs(conf_dest)
            if os.path.exists(dir_pattern().format(conf_dest, dest_folder)):
                try:
                    os.rmdir(dir_pattern().format(conf_dest, dest_folder))
                except OSError as e:
                    if e.errno == errno.ENOTEMPTY:
                        exit("Unable to copy config files to {}. Destination is not empty."
                             .format(dir_pattern().format(conf_dest, dest_folder)))
                    else:
                        raise e
            shutil.copytree(conf_path, dir_pattern().format(conf_dest, dest_folder))

        f = Files(self.config)
        src = dir_pattern(3).format(config['stage'], config['job'], 'collection')
        # dst = dir_pattern(3).format(config['stage'], config['job'], '.compare')
        dst = dir_pattern(4).format(config['stage'], config['job'], 'file_comparison', 'raw')
        if os.path.exists(dst):
            f.empty_dir(dst)
            os.rmdir(dst)
        shutil.copytree(src, dst)

        if len(duplicates) > 0:
            print('')
            print('The following files had naming collisions when unpacked.\n'
                  'Please verify the contents and keep only the appropriate file(s).')
            print('Please do not rename files, simply delete any unwanted files.')
            for i in duplicates:
                print('')
                for j in duplicates[i]:
                    print(j)
            print('')

        f.save_env()

    elif config['vap']:
        f = Files(self.config)
        f.save_env()
        vap = VapMgr(self.config)
        vap.add_to_env()

    return config, self.files
def run(self):
    """ Unpack the tar file """
    # Setup vars
    st_files = self.tar.st_files
    file_names = self.tar.file_names
    files = []
    for i in range(len(st_files)):
        files.append([])

    # Open the tar file
    tar = tarfile.open(dir_pattern().format(self.tar.archive_path, self.file), 'r')

    # Get the content of the tar file and check for duplicate file names
    members = tar.getmembers()
    f = Files(self.config)

    # Iterate over each entry in the tar file
    for i, m in enumerate(members):
        # Make sure arrays are not 0 length
        if len(file_names) == 0:
            file_names.append([])
        if len(files) == 0:
            files.append([])
        if len(st_files) == 0:
            st_files.append([])

        # Iterate over each entry in file_names
        # Add the file name to the first bucket where it does not collide
        for k, v in enumerate(file_names):
            sf_names = st_files[k]
            sn = f.strip_name(m.name)
            if sn is None or sn == 'orig' or sn == 'bad':
                sn = m.name
            if not (m.name in v or sn in sf_names):
                file_names[k].append(m.name)
                files[k].append(m)
                st_files[k].append(sn)
                break
        else:
            # The name collides in every existing bucket, so start a new one
            file_names.append([m.name])
            files.append([m])
            st_files.append([sn])

    duplicates = {}
    stripped = st_files[0]
    full_names = file_names[0]
    for i in range(1, len(file_names)):
        for k, v in enumerate(file_names[i]):
            try:
                myIndex = stripped.index(st_files[i][k])
            except (IndexError, ValueError):
                print("\nOOPS\n")
                print("\nI: {}\nK: {}".format(i, k))
                continue
            try:
                key = full_names[myIndex]
            except IndexError:
                print("\nOOPS 2\n")
                continue
            if key not in duplicates:
                duplicates[key] = []
            duplicates[key].append(v)

    # Extract all files; each additional bucket goes into a dup_N subdirectory
    for i in range(len(files)):
        if i > 0:
            path = 'dup_{}'.format(i)
        else:
            path = ''
        tar.extractall(path=path, members=files[i])

    tar.close()
    self.tar.duplicates = duplicates
    return
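# Rough illustration of the bucketing above (file names are made up): if a tar contains
# two members whose names or stripped names collide, they end up in separate buckets,
# e.g. file_names -> [['sgpmfrsrC1.00.data.raw'], ['sgpmfrsrC1.00.data.raw']]. Bucket 0
# is extracted into the working directory, bucket 1 into dup_1, bucket 2 into dup_2, and
# so on, and self.tar.duplicates maps each colliding name from bucket 0 to the matching
# names found in the later buckets so handle_duplicate_files() can reconcile them.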
def handle_duplicate_files(self):
    """ Handle duplicates that were extracted into dup_N directories """
    f = Files(self.config)
    dup_list = {}
    duplicates = {}
    files = self.file_names
    dups = self.duplicates

    if len(dups) > 0:
        for i, n in dups.items():
            for j, v in enumerate(n):
                folder = 'dup_{}'.format(j + 1)

                # If the duplicate is identical to the original it can simply be
                # deleted; otherwise it must be kept under a versioned name
                if f.is_same_file(
                        dir_pattern().format(self.stage_path, i),
                        dir_pattern(3).format(self.stage_path, folder, v)):
                    delete = True
                    move = False
                else:
                    delete = False
                    move = True

                if delete:
                    os.remove(dir_pattern(3).format(self.stage_path, folder, v))
                elif move:
                    if i not in dup_list:
                        # Rename the original file with a '.v1' suffix
                        name = '{}.v1'.format(i)
                        dup_list[i] = [name]
                        src = dir_pattern().format(self.stage_path, i)
                        dst = dir_pattern().format(self.stage_path, name)
                        try:
                            os.rename(src, dst)
                        except OSError:
                            shutil.move(src, dst)

                    # Move the duplicate in with the next version number
                    num = len(dup_list[i]) + 1
                    name = '{}.v{}'.format(v, num)
                    dup_list[i].append(name)
                    src = dir_pattern(3).format(self.stage_path, folder, v)
                    dst = dir_pattern().format(self.stage_path, name)
                    try:
                        os.rename(src, dst)
                    except OSError:
                        shutil.move(src, dst)

    for i in dup_list:
        if len(dup_list[i]) > 1:
            key = dir_pattern().format(self.local, i)
            duplicates[key] = []
            for j in dup_list[i]:
                duplicates[key].append(dir_pattern().format(self.local, j))

    self.dups = duplicates

    # Delete the dup_* directories now that their contents have been handled
    dupdirs = glob('{}/dup_*'.format(self.stage_path))
    for i in dupdirs:
        f.empty_dir(i)
        os.rmdir(i)

    return False if duplicates == {} else duplicates
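# Example of the renaming scheme above (hypothetical file name): if 'data.20140101.raw'
# is unpacked from two tar files with differing contents, the copy in the working
# directory is renamed 'data.20140101.raw.v1' and the copy from dup_1 becomes
# 'data.20140101.raw.v2'. handle_duplicate_files() then returns a dict mapping
# '<local>/data.20140101.raw' to both versioned paths so the caller can report the
# collision and ask the user to keep only the appropriate file.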