def populate_database_recursive(nc_Database, data, options, find_function, semaphores=None):
    if 'soft_links' in data.groups.keys():
        soft_links = data.groups['soft_links']
        paths = soft_links.variables['path'][:]
        for path_id, path in enumerate(paths):
            #id_list=['file_type','search']
            id_list = ['file_type']
            for id in id_list:
                setattr(nc_Database.file_expt, id,
                        soft_links.variables[id][path_id])
            #Check if data_node was included:
            data_node = retrieval_utils.get_data_node(
                soft_links.variables['path'][path_id],
                soft_links.variables['file_type'][path_id])
            if is_level_name_included_and_not_excluded('data_node', options, data_node):
                setattr(nc_Database.file_expt, 'path',
                        '|'.join([soft_links.variables['path'][path_id],
                                  soft_links.variables['checksum'][path_id]]))
                setattr(nc_Database.file_expt, 'version',
                        'v' + str(soft_links.variables['version'][path_id]))
                setattr(nc_Database.file_expt, 'data_node', data_node)
                find_function(nc_Database, copy.deepcopy(nc_Database.file_expt),
                              semaphores=semaphores)
    elif len(data.groups.keys()) > 0:
        for group in data.groups.keys():
            level_name = data.groups[group].getncattr('level_name')
            if is_level_name_included_and_not_excluded(level_name, options, group):
                setattr(nc_Database.file_expt, level_name, group)
                populate_database_recursive(nc_Database, data.groups[group],
                                            options, find_function,
                                            semaphores=semaphores)
    elif 'path' in data.ncattrs():
        #for fx variables:
        #id_list=['file_type','search']
        id_list = ['file_type']
        for id in id_list:
            setattr(nc_Database.file_expt, id, data.getncattr(id))
        #Check if data_node was included:
        data_node = retrieval_utils.get_data_node(data.getncattr('path'),
                                                  data.getncattr('file_type'))
        if is_level_name_included_and_not_excluded('data_node', options, data_node):
            checksum = ''
            if 'checksum' in data.ncattrs():
                checksum = data.getncattr('checksum')
            setattr(nc_Database.file_expt, 'path',
                    '|'.join([data.getncattr('path'), checksum]))
            setattr(nc_Database.file_expt, 'version', str(data.getncattr('version')))
            setattr(nc_Database.file_expt, 'data_node',
                    retrieval_utils.get_data_node(nc_Database.file_expt.path,
                                                  nc_Database.file_expt.file_type))
            find_function(nc_Database, copy.deepcopy(nc_Database.file_expt))
    else:
        #for retrieved datasets:
        #id_list=['file_type','search','path','version']
        id_list = ['file_type', 'path', 'version']
        for id in id_list:
            setattr(nc_Database.file_expt, id, '')
        if len(data.variables.keys()) > 0:
            setattr(nc_Database.file_expt, 'data_node',
                    retrieval_utils.get_data_node(nc_Database.file_expt.path,
                                                  nc_Database.file_expt.file_type))
            find_function(nc_Database, copy.deepcopy(nc_Database.file_expt))
    return
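# A minimal driver sketch for the recursive walker above. It assumes a
# netCDF4.Dataset opened on a soft-links header file and an nc_Database
# wrapper exposing `file_expt` and a SQLAlchemy-style `session`, as used
# in record_url() below; `harvest_paths` and `record_function` are
# hypothetical illustration names, not part of this module.
import netCDF4

def harvest_paths(nc_Database, header_file, options):
    data = netCDF4.Dataset(header_file, 'r')
    try:
        def record_function(db, file_expt, semaphores=None):
            #populate_database_recursive passes a deep copy, so the
            #record can be stored directly:
            db.session.add(file_expt)
        populate_database_recursive(nc_Database, data, options, record_function)
        nc_Database.session.commit()
    finally:
        data.close()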
def record_url(remote_file_desc, nc_Database):
    nc_Database.file_expt.path = remote_file_desc['url']
    nc_Database.file_expt.data_node = retrieval_utils.get_data_node(
        remote_file_desc['url'], remote_file_desc['file_type'])
    if (remote_file_desc['file_type'] in nc_Database.drs.remote_file_types and
            remote_file_desc['checksum']):
        nc_Database.file_expt.path += '|' + remote_file_desc['checksum']
    else:
        nc_Database.file_expt.path += '|'
    for val in nc_Database.drs.remote_fields:
        setattr(nc_Database.file_expt, val, remote_file_desc[val])
    #Convert unicode to string:
    for val in dir(nc_Database.file_expt):
        if val[0] != '_' and val != 'case_id':
            setattr(nc_Database.file_expt, val,
                    str(getattr(nc_Database.file_expt, val)))

    list_of_knowns = [getattr(nc_Database.file_expt, field)
                      for field in nc_Database.drs.known_fields]
    list_of_retrieved = [remote_file_desc[field]
                         for field in nc_Database.drs.known_fields]
    #Record only if the version is a concrete one (not 'latest') and
    #every known field matches what was retrieved:
    if remote_file_desc['version']:
        if (remote_file_desc['version'][1:] != 'atest' and
                all(i == j for i, j in zip(list_of_knowns, list_of_retrieved))):
            nc_Database.session.add(copy.deepcopy(nc_Database.file_expt))
            nc_Database.session.commit()
    return nc_Database
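# Illustration of the path-packing convention used above: the `path` field
# stores "<url>|<checksum>", with an empty checksum allowed for file types
# that carry none. The URLs below are made up for the example.
packed = 'http://esgf.example.org/thredds/fileServer/a/b/file.nc|abc123'
url, checksum = packed.split('|')
assert checksum == 'abc123'

packed_no_checksum = 'ftp://ftp.example.org/data/file.nc|'
url, checksum = packed_no_checksum.split('|')
assert checksum == ''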
def order_paths_by_preference(self):
    #FIND ORDERING:
    paths_desc = []
    for id in self.sorts_list:
        paths_desc.append((id, np.int32))
    for id in self.id_list:
        paths_desc.append((id, 'a255'))
    paths_ordering = np.empty((len(self.paths_list),), dtype=paths_desc)
    for file_id, file in enumerate(self.paths_list):
        paths_ordering['path'][file_id] = file['path'].split('|')[0]
        #Convert the path name to a 'unique' integer using hash.
        #The integer will not really be unique, but collisions
        #should be extremely rare for similar strings with only small variations.
        paths_ordering['path_id'][file_id] = hash(paths_ordering['path'][file_id])
        paths_ordering['checksum'][file_id] = file['path'].split('|')[1]
        paths_ordering['version'][file_id] = np.long(file['version'][1:])
        paths_ordering['file_type'][file_id] = file['file_type']
        paths_ordering['data_node'][file_id] = retrieval_utils.get_data_node(
            file['path'], paths_ordering['file_type'][file_id])

    #Sort paths from most desired to least desired.
    #First, encode desiredness from least to most, so that the most
    #desired data node / file type gets the largest index:
    data_node_order = copy.copy(self.data_node_list)[::-1]
    file_type_order = copy.copy(self.file_type_list)[::-1]
    for file_id, file in enumerate(self.paths_list):
        paths_ordering['data_node_id'][file_id] = data_node_order.index(
            paths_ordering['data_node'][file_id])
        paths_ordering['file_type_id'][file_id] = file_type_order.index(
            paths_ordering['file_type'][file_id])
    #'version' is implicitly ordered from oldest to most recent.
    #Sort ascending, then reverse to get from most to least desired:
    return np.sort(paths_ordering, order=self.sorts_list)[::-1]
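# Self-contained illustration of the ranking trick above: np.sort on a
# structured array with `order=` sorts lexicographically by the named
# fields. Because the preference lists are reversed before .index() is
# taken, a larger data_node_id means a more desirable node, so an
# ascending sort followed by [::-1] puts the best replica first. The
# field names here are hypothetical stand-ins for `sorts_list`/`id_list`.
import numpy as np

desc = [('data_node_id', np.int32), ('version', np.int32), ('path', 'a64')]
table = np.array([(1, 20110101, 'old_version_preferred_node'),
                  (0, 20120101, 'new_version_other_node'),
                  (1, 20120101, 'new_version_preferred_node')], dtype=desc)
ranked = np.sort(table, order=['data_node_id', 'version'])[::-1]
print(ranked['path'][0])  # -> 'new_version_preferred_node'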
def __init__(self, netcdf_file_name, file_type, semaphores):
    self.file_name = netcdf_file_name
    self.semaphores = semaphores
    self.file_type = file_type
    self.remote_data_node = retrieval_utils.get_data_node(self.file_name,
                                                          self.file_type)
    if isinstance(semaphores, dict):
        self.in_semaphores = (self.remote_data_node in self.semaphores.keys())
    else:
        self.in_semaphores = False
    self.Dataset = None
    return
def __init__(self, search_path, options):
    self.file_type = 'FTPServer'
    self.options = options
    self.search_path = search_path.rstrip('/')
    self.data_node = retrieval_utils.get_data_node(self.search_path,
                                                   self.file_type)
    if (self.options.username is not None and
            hasattr(self.options, 'password') and
            self.options.password is not None):
        #Use credentials:
        self.ftp = ftplib.FTP(self.data_node.split('/')[2],
                              self.options.username, self.options.password)
    else:
        #Do not use credentials and hope for anonymous access:
        self.ftp = ftplib.FTP(self.data_node.split('/')[2])
    return
def define_queues(options, data_node_list):
    queues = {data_node: multiprocessing.Queue()
              for data_node in data_node_list}
    #The 'end' queue is reserved for completion signalling:
    queues['end'] = multiprocessing.Queue()
    if hasattr(options, 'source_dir') and options.source_dir is not None:
        queues[retrieval_utils.get_data_node(options.source_dir,
                                             'local_file')] = multiprocessing.Queue()
    return queues
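# Sketch of the consumer side these queues imply, assuming each queued item
# is a (retrieval_function_name, arg_dict, tree) tuple as put() by the
# retrieve_* methods below, and that a None sentinel ends a worker. The
# `resolve` argument (e.g. lambda name: getattr(retrieval_utils, name))
# and both function names are hypothetical.
import multiprocessing

def queue_worker(queue, resolve):
    while True:
        item = queue.get()
        if item is None:  #sentinel: no more work for this data node
            break
        retrieval_function_name = item[0]
        resolve(retrieval_function_name)(*item[1:])

def start_workers(queues, resolve):
    #One serial worker per data node keeps requests to any single
    #server sequential while different servers download in parallel:
    workers = []
    for data_node, queue in queues.items():
        if data_node == 'end':  #reserved for completion signalling
            continue
        process = multiprocessing.Process(target=queue_worker,
                                          args=(queue, resolve))
        process.start()
        workers.append(process)
    return workers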
def descend_tree(self, database, list_level=None):
    only_list = []
    if self.file_type in database.header['file_type_list']:
        description = {'file_type': self.file_type,
                       'data_node': retrieval_utils.get_data_node(self.search_path,
                                                                  self.file_type),
                       'time': '0'}
        file_expt_copy = copy.deepcopy(database.nc_Database.file_expt)
        for att in description.keys():
            setattr(file_expt_copy, att, description[att])

        only_list.append(descend_tree_recursive(
            database, file_expt_copy,
            [item for item in database.drs.base_drs
             if item not in description.keys()],
            self.search_path, self.options, list_level=list_level))
        if hasattr(database.drs, 'alt_base_drs'):
            only_list.append(descend_tree_recursive(
                database, file_expt_copy,
                [item for item in database.drs.alt_base_drs
                 if item not in description.keys()],
                self.search_path, self.options, list_level=list_level, alt=True))
    return [item for sublist in only_list for item in sublist]
def retrieve_without_time(self, retrieval_function, output, semaphores=None,
                          username=None, user_pass=None):
    #This function simply retrieves all the files:
    file_path = output
    for path_to_retrieve in self.paths_list:
        path_index = list(self.paths_list).index(path_to_retrieve)
        file_type = self.file_type_list[path_index]
        version = 'v' + str(self.version_list[path_index])
        checksum = self.checksums_list[path_index]
        #Get the file tree:
        args = ({'path': path_to_retrieve + '|' + checksum,
                 'var': self.tree[-1],
                 'file_path': file_path,
                 'version': version,
                 'file_type': file_type,
                 'username': username,
                 'user_pass': user_pass},
                copy.deepcopy(self.tree))
        #Retrieve only if it is from the requested data node:
        data_node = retrieval_utils.get_data_node(path_to_retrieve, file_type)
        if nc_Database.is_level_name_included_and_not_excluded('data_node',
                                                               self, data_node):
            if data_node in self.queues.keys():
                print 'Recovering ' + '/'.join(self.tree)
                self.queues[data_node].put((retrieval_function,) +
                                           copy.deepcopy(args))
    return
def retrieve_variables(self, retrieval_function, var_to_retrieve,
                       time_restriction, output, semaphores=None,
                       username=None, user_pass=None):
    #Replicate variable to output:
    if isinstance(output, (netCDF4.Dataset, netCDF4.Group)):
        output = netcdf_utils.replicate_netcdf_var(output, self.data_root,
                                                   var_to_retrieve,
                                                   chunksize=-1, zlib=True)
        file_path = None
        if 'soft_links' not in self.data_root.groups.keys():
            #Variable is stored locally; simply retrieve it:
            output.variables[var_to_retrieve][:] = \
                self.data_root.variables[var_to_retrieve][time_restriction]
            return
    else:
        file_path = output

    dimensions = dict()
    unsort_dimensions = dict()
    dims_length = []
    for dim in self.data_root.variables[var_to_retrieve].dimensions:
        if dim != 'time':
            if dim in self.data_root.variables.keys():
                dimensions[dim] = self.data_root.variables[dim][:]
            else:
                dimensions[dim] = np.arange(len(self.data_root.dimensions[dim]))
            unsort_dimensions[dim] = None
            dims_length.append(len(dimensions[dim]))

    #Determine the path ids for the soft links:
    paths_link = self.data_root.groups['soft_links'].variables[
        var_to_retrieve][time_restriction, 0]
    indices_link = self.data_root.groups['soft_links'].variables[
        var_to_retrieve][time_restriction, 1]
    #Convert paths_link to an id in the path dimension:
    paths_link = np.array([list(self.paths_id_list).index(path_id)
                           for path_id in paths_link])
    #Sort the paths so that we query each one only once:
    unique_paths_list_id, sorting_paths = np.unique(paths_link,
                                                    return_inverse=True)

    #Maximum number of time steps per request:
    max_request = 450  #maximum request in MB
    max_time_steps = max(int(np.floor(max_request * 1024 * 1024 /
                                      (32 * np.prod(dims_length)))), 1)

    for unique_path_id, path_id in enumerate(unique_paths_list_id):
        path_to_retrieve = self.paths_list[path_id]
        #Check if the file is available. If it is not, replace it
        #with another file with the same checksum, if there is one!
        file_type = self.file_type_list[list(self.paths_list).index(path_to_retrieve)]
        remote_data = remote_netcdf.remote_netCDF(
            path_to_retrieve.replace('fileServer', 'dodsC'),
            file_type, semaphores)
        if file_type not in ['FTPServer']:
            path_to_retrieve = remote_data.check_if_available_and_find_alternative(
                [path.replace('fileServer', 'dodsC') for path in self.paths_list],
                self.checksums_list).replace('dodsC', 'fileServer')
        #Get the file_type, checksum and version of the file to retrieve:
        path_index = list(self.paths_list).index(path_to_retrieve)
        file_type = self.file_type_list[path_index]
        version = 'v' + str(self.version_list[path_index])
        checksum = self.checksums_list[path_index]
        #Append the checksum:
        path_to_retrieve += '|' + checksum

        time_indices = indices_link[sorting_paths == unique_path_id]
        num_time_chunk = int(np.ceil(len(time_indices) / float(max_time_steps)))
        for time_chunk in range(num_time_chunk):
            time_slice = slice(time_chunk * max_time_steps,
                               (time_chunk + 1) * max_time_steps, 1)
            dimensions['time'], unsort_dimensions['time'] = \
                indices_utils.prepare_indices(time_indices[time_slice])
            #Get the file tree:
            args = ({'path': path_to_retrieve,
                     'var': var_to_retrieve,
                     'indices': dimensions,
                     'unsort_indices': unsort_dimensions,
                     'sort_table': np.arange(len(sorting_paths))[
                         sorting_paths == unique_path_id][time_slice],
                     'file_path': file_path,
                     'version': version,
                     'file_type': file_type,
                     'username': username,
                     'user_pass': user_pass},
                    copy.deepcopy(self.tree))
            #Retrieve only if it is from the requested data node:
            data_node = retrieval_utils.get_data_node(path_to_retrieve, file_type)
            if nc_Database.is_level_name_included_and_not_excluded('data_node',
                                                                   self, data_node):
                if data_node in self.queues.keys():
                    if (isinstance(output, (netCDF4.Dataset, netCDF4.Group)) or
                            time_chunk == 0):
                        #If it is download: retrieve.
                        #If it is download_raw: retrieve only the first time_chunk.
                        if var_to_retrieve == self.tree[-1]:
                            print 'Recovering ' + '/'.join(self.tree)
                        self.queues[data_node].put((retrieval_function,) +
                                                   copy.deepcopy(args))
                else:
                    if isinstance(output, (netCDF4.Dataset, netCDF4.Group)):
                        netcdf_utils.assign_tree(
                            output,
                            *getattr(retrieval_utils, retrieval_function)(args[0],
                                                                          args[1]))
    return
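# Worked example of the chunking rule in retrieve_variables above, using
# the hard-coded 450 MB cap and the factor of 32 (which reads as a
# conservative bytes-per-value estimate) on a hypothetical 180 x 360
# lat-lon field:
import numpy as np

max_request = 450                    #maximum request in MB, as above
dims_length = [180, 360]             #lengths of all non-time dimensions
max_time_steps = max(int(np.floor(max_request * 1024 * 1024 /
                                  (32 * np.prod(dims_length)))), 1)
print(max_time_steps)  # -> 227 time steps per request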