def make_dispatcher(mdata, mdata_resource=None, work_path=None, run_tasks=None, group_size=None):
    """Create a ``Dispatcher`` configured from a machine-description dict.

    Parameters
    ----------
    mdata : dict
        Machine description. Recognized keys:
        - 'hostname': presence switches the context to 'ssh' (remote runs);
        - 'batch': batch system name ('machine_type' is the deprecated fallback);
        - 'lazy_local': bool, switches a local context to 'lazy-local';
        - 'ali_auth': ALI cloud mode (currently disabled, returns None).
    mdata_resource, work_path, run_tasks, group_size :
        Kept for interface compatibility with the disabled ALI cloud path;
        unused for the local/ssh path.

    Returns
    -------
    Dispatcher or None
        A configured Dispatcher, or None when 'ali_auth' is present
        (the ALI cloud dispatcher below is commented out).
    """
    if 'ali_auth' in mdata:
        # ALI cloud dispatcher is disabled; kept for reference.
        #from maptool.dispatcher.ALI import ALI
        #nchunks = len(_split_tasks(run_tasks, group_size))
        #dispatcher = ALI(mdata['ali_auth'], mdata_resource, mdata, nchunks)
        #dispatcher.init(work_path, run_tasks, group_size)
        #return dispatcher
        pass
    else:
        # A 'hostname' entry means the jobs run on a remote machine over ssh.
        # (Was a bare `except:` around mdata['hostname'], which would also
        # swallow KeyboardInterrupt and friends.)
        context_type = 'ssh' if 'hostname' in mdata else 'local'
        if 'batch' in mdata:
            batch_type = mdata['batch']
        else:
            mlog.info('cannot find key "batch" in machine file, try to use deprecated key "machine_type"')
            batch_type = mdata['machine_type']
        lazy_local = mdata.get('lazy_local', False)
        if lazy_local and context_type == 'local':
            mlog.info('Dispatcher switches to the lazy local mode')
            context_type = 'lazy-local'
        disp = Dispatcher(mdata, context_type=context_type, batch_type=batch_type)
        return disp
def download(self, job_dirs, remote_down_files, check_exists=False, mark_failure=True, back_error=False):
    """Move files for each job dir from ``self.remote_root`` back to ``self.local_root``.

    Parameters
    ----------
    job_dirs : list of str
        Job sub-directories (relative to both roots) to collect files from.
    remote_down_files : list of str
        File names (relative to each job dir) to bring back. Never mutated.
    check_exists : bool
        If True, a missing remote+local file is tolerated; with
        mark_failure=True a 'tag_failure_download_<name>' tag file is
        written into the local job dir instead of raising.
    mark_failure : bool
        See check_exists.
    back_error : bool
        Also collect any 'error*' files found in the remote job dir.

    Raises
    ------
    RuntimeError
        When a requested file exists neither remotely nor locally and
        check_exists is False.
    """
    for ii in job_dirs:
        local_job = os.path.join(self.local_root, ii)
        remote_job = os.path.join(self.remote_root, ii)
        # Copy the list: the original code aliased the caller's
        # remote_down_files and (with back_error) appended 'error*' entries
        # to it, so names accumulated across job dirs and leaked to the caller.
        flist = list(remote_down_files)
        if back_error:
            # glob directly in the remote job dir instead of chdir-ing there
            flist.extend(os.path.basename(p)
                         for p in glob(os.path.join(remote_job, 'error*')))
        for jj in flist:
            rfile = os.path.join(remote_job, jj)
            lfile = os.path.join(local_job, jj)
            if os.path.realpath(rfile) == os.path.realpath(lfile):
                # same underlying file (linked roots): nothing to do
                continue
            rexists = os.path.exists(rfile)
            lexists = os.path.exists(lfile)
            if not rexists and not lexists:
                if check_exists:
                    if mark_failure:
                        tag = os.path.join(self.local_root, ii,
                                           'tag_failure_download_%s' % jj)
                        # empty tag file marks the failed download
                        with open(tag, 'w'):
                            pass
                    # else: silently tolerate the missing file
                else:
                    raise RuntimeError('do not find download file ' + rfile)
            elif not rexists and lexists:
                # already downloaded
                pass
            elif rexists and not lexists:
                # trivial case, download happily
                shutil.move(rfile, lfile)
            else:
                # both exist, replace the local copy with the remote one
                mlog.info('find existing %s, replacing by %s' % (lfile, rfile))
                if os.path.isdir(lfile):
                    shutil.rmtree(lfile)
                elif os.path.isfile(lfile) or os.path.islink(lfile):
                    os.remove(lfile)
                shutil.move(rfile, lfile)
def _rmtree(self, sftp, remotepath, level=0, verbose = False): for f in sftp.listdir_attr(remotepath): rpath = os.path.join(remotepath, f.filename) if stat.S_ISDIR(f.st_mode): self._rmtree(sftp, rpath, level=(level + 1)) else: rpath = os.path.join(remotepath, f.filename) if verbose: mlog.info('removing %s%s' % (' ' * level, rpath)) sftp.remove(rpath) if verbose: mlog.info('removing %s%s' % (' ' * level, remotepath)) sftp.rmdir(remotepath)
def ensure_alive(self, max_check=10, sleep_time=10):
    """Make sure the ssh connection is alive, reconnecting when it is not.

    Repeatedly calls ``self._check_alive()``; on failure re-runs
    ``self._setup_ssh`` with the stored credentials and sleeps
    *sleep_time* seconds between attempts.

    Parameters
    ----------
    max_check : int
        Maximum number of failed checks before giving up.
    sleep_time : int
        Seconds to wait between reconnection attempts.

    Raises
    ------
    RuntimeError
        After *max_check* consecutive failed aliveness checks.
    """
    count = 1
    while not self._check_alive():
        if count == max_check:
            raise RuntimeError('cannot connect ssh after %d failures at interval %d s' % (max_check, sleep_time))
        mlog.info('connection check failed, try to reconnect to ' + self.remote_host)
        self._setup_ssh(self.remote_host,
                        self.remote_port,
                        username=self.remote_uname,
                        password=self.remote_password)
        count += 1
        # fix: the original called time.sleep(sleep) with an undefined
        # name 'sleep', raising NameError on the very first retry
        time.sleep(sleep_time)
def all_finished(self, job_handler, mark_failure):
    """Poll every job chunk in *job_handler* and report overall completion.

    Terminated jobs are resubmitted in place (a chunk may fail at most 3
    times before RuntimeError); finished jobs have their result files
    downloaded, their remote context cleaned, and are marked finished in
    the persistent job record.

    Parameters
    ----------
    job_handler : dict
        The structure returned by ``submit_jobs`` ('task_chunks',
        'job_list', 'job_record', 'command', 'resources', 'outlog',
        'errlog', 'backward_task_files').
    mark_failure : bool
        If True, per-command failure tag files are downloaded (best
        effort) and missing backward files are tolerated.

    Returns
    -------
    bool
        True when every chunk is recorded as finished.

    Raises
    ------
    RuntimeError
        When a chunk has terminated more than 3 times.
    """
    task_chunks = job_handler['task_chunks']
    # a chunk is identified by the sha1 of its '+'-joined task names
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_list = job_handler['job_list']
    job_record = job_handler['job_record']
    command = job_handler['command']
    # one failure tag file per command in the chunk's command list
    tag_failure_list = ['tag_failure_%d' % ii for ii in range(len(command))]
    resources = job_handler['resources']
    outlog = job_handler['outlog']
    errlog = job_handler['errlog']
    backward_task_files = job_handler['backward_task_files']
    mlog.debug('checking jobs')
    nchunks = len(task_chunks)
    for idx in range(nchunks) :
        cur_hash = task_hashes[idx]
        rjob = job_list[idx]
        if not job_record.check_finished(cur_hash) :
            # chunk not finished according to record
            status = rjob['batch'].check_status()
            job_uuid = rjob['context'].job_uuid
            mlog.debug('checked job %s' % job_uuid)
            if status == JobStatus.terminated :
                # resubmit a terminated chunk, giving up after 3 failures
                job_record.increase_nfail(cur_hash)
                if job_record.check_nfail(cur_hash) > 3:
                    raise RuntimeError('Job %s failed for more than 3 times' % job_uuid)
                mlog.info('job %s terminated, submit again'% job_uuid)
                mlog.debug('try %s times for %s'% (job_record.check_nfail(cur_hash), job_uuid))
                rjob['batch'].submit(task_chunks[idx], command, res = resources, outlog=outlog, errlog=errlog,restart=True)
            elif status == JobStatus.finished :
                mlog.info('job %s finished' % job_uuid)
                if mark_failure:
                    # pull failure tags first (never marking them failed),
                    # then the results, tolerating missing files
                    rjob['context'].download(task_chunks[idx], tag_failure_list, check_exists = True, mark_failure = False)
                    rjob['context'].download(task_chunks[idx], backward_task_files, check_exists = True)
                else:
                    rjob['context'].download(task_chunks[idx], backward_task_files)
                rjob['context'].clean()
                job_record.record_finish(cur_hash)
                # persist immediately so progress survives interruption
                job_record.dump()
    job_record.dump()
    return job_record.check_all_finished()
def build_operation(choice):
    """Interactive structure-building driver, dispatching on *choice*.

    "1": build supercell(s) from user-supplied scaling factors
         (1, 3 or 9 integers) and write them as POSCAR files.
    "2": build a carbon nanotube from (m, n) indices via ase's
         ``nanotube`` and write it as a POSCAR file.
    other ("3"): build adsorption structures of a molecule on crystal
         slabs, either driven by an 'adsorb.cfg' file or interactively.

    Always returns True. Reads user input through wait()/wait_sep() and
    writes structure files into the current directory.
    """
    assert choice in ["1", "2", "3"]
    if choice == "1":
        # --- supercell construction ---
        structs, fnames = read_structures()
        multi_structs(structs, fnames)
        wait_sep()
        tip = """
Several options are available:

a. A full 3x3 scaling matrix defining the linear combination
   the old lattice vectors. E.g., 2 1 0  0 1 0  0 0 3
   generates a new structure with lattice vectors a' = 2a + b,
   b' = 3b, c' = c where a, b, and c are the lattice vectors
   of the original structure.
b. An sequence of three scaling factors. E.g., 2 1 1
   specifies that the supercell should have dimensions 2a x b x c.
c. A number, which simply scales all lattice vectors by the
   same factor.
"""
        print(tip)
        wait_sep()
        in_str = wait()
        scaling_list = [int(x) for x in in_str.split()]
        print("scaling list:")
        print(scaling_list)
        for struct, fname in zip(structs, fnames):
            # 1 value: uniform scale; 3: per-axis; 9: full 3x3 matrix.
            # NOTE(review): any other length leaves 'scales' undefined and
            # raises NameError below — no validation is performed.
            if len(scaling_list) == 1:
                scales = scaling_list[0]
                sufix = [scales]  # NOTE(review): unused afterwards
            elif len(scaling_list) == 3:
                scales = scaling_list
            elif len(scaling_list) == 9:
                scales = [
                    scaling_list[0:3], scaling_list[3:6], scaling_list[6:9]
                ]
            struct_cp = struct.copy()
            struct_cp.make_supercell(scales)
            fname = 'maptool_SC_' + fname + '.vasp'
            struct_cp.to(filename=fname, fmt='poscar')
        return True
    elif choice == "2":
        # --- carbon nanotube ---
        print('Only support for CNT now !')
        print('Input the n and m for tube')
        # NOTE(review): original prompt contained a garbled control
        # character here; reconstructed as a plain colon.
        print('Paramter format, i.e. :')
        print('3 3')
        wait_sep()
        in_str = wait()
        m, n = [int(i) for i in in_str.split()]
        atoms = nanotube(m, n, vacuum=15)
        struct = ase2pmg(atoms)
        struct.to('POSCAR', 'CNT_' + str(m) + '-' + str(n) + '.vasp')
        return True
    else:
        # --- adsorption structures ---
        # defaults used when keys are absent from adsorb.cfg / interactive run
        data = {
            'max_index': 2,
            'min_vacum': 20,
            'min_slab': 8,
            'repeat': [3, 3, 1]
        }

        def read_adsorb_config(filename):
            # Parse "key = value  # comment" lines from the config file
            # into a dict with typed values.
            with open(filename, 'r') as f:
                datas = f.readlines()
            list_data = []
            for i in range(len(datas)):
                # strip trailing '# ...' comment, then split on '='
                list_data.append(
                    datas[i][0:datas[i].find('#')].strip().split('='))
            defined_keys = [
                'method', 'crystal', 'molecule', 'max_index', 'min_vacum',
                'min_slab', 'repeat'
            ]
            data_dict = {}
            for key in defined_keys:
                for li in list_data:
                    if key in li[0]:
                        data_dict[key] = li[1]
            data_dict['method'] = int(data_dict.get('method').strip())
            data_dict['crystal'] = data_dict.get('crystal').strip()
            data_dict['molecule'] = data_dict.get('molecule').strip()
            data_dict['max_index'] = int(
                data_dict.get('max_index', '1').strip())
            data_dict['min_vacum'] = int(
                data_dict.get('min_vacum', '15').strip())
            data_dict['min_slab'] = int(data_dict.get('min_slab', '5').strip())
            data_dict['repeat'] = [
                int(x) for x in data_dict.get('repeat', '1 1 1').strip().split()
            ]
            return data_dict

        def proc_adsorb(cryst, mol, data):
            # method 1: adsorb directly on the given (slab) structure;
            # otherwise enumerate slabs up to max_index and adsorb on each.
            if data['method'] == 1:
                asf_slab = AdsorbateSiteFinder(cryst)
                ads_sites = asf_slab.find_adsorption_sites()
                ads_structs = asf_slab.generate_adsorption_structures(
                    mol, repeat=data['repeat'])
                for i in range(len(ads_structs)):
                    ads_struct = ads_structs[i]
                    try:
                        miller_str = [str(j) for j in cryst.miller_index]
                    except:
                        # bulk structures have no miller_index attribute
                        miller_str = ['adsorb']
                    filename = '_'.join(miller_str) + '-' + str(i) + '.vasp'
                    ads_struct.to(filename=filename, fmt='POSCAR')
            else:
                slabs = generate_all_slabs(cryst,
                                           max_index=data['max_index'],
                                           min_slab_size=data['min_slab'],
                                           min_vacuum_size=data['min_vacum'],
                                           lll_reduce=True)
                for slab in slabs:
                    asf_slab = AdsorbateSiteFinder(slab)
                    ads_sites = asf_slab.find_adsorption_sites()
                    ads_structs = asf_slab.generate_adsorption_structures(
                        mol, repeat=data['repeat'])
                    for i in range(len(ads_structs)):
                        ads_struct = ads_structs[i]
                        miller_str = [str(j) for j in slab.miller_index]
                        filename = 'adsorb' + '_'.join(miller_str) + '-' + str(
                            i) + '.vasp'
                        ads_struct.to(filename=filename, fmt='POSCAR')

        filename = 'adsorb.cfg'
        if os.path.exists(filename):
            # config-file driven run
            data = read_adsorb_config(filename)
            assert data['method'] in [1, 2]
            cryst = read_structures_from_file(data['crystal'])
            mol = read_structures_from_file(data['molecule'])
            proc_adsorb(cryst, mol, data)
        else:
            # interactive run
            print('your choice ?')
            print('{} >>> {}'.format('1', 'read slab from file'))
            print('{} >>> {}'.format('2', 'build slab by bulk'))
            wait_sep()
            in_str = wait()
            choice = int(in_str)
            assert choice in [1, 2]
            data['method'] = choice
            tips = """\
Input the structure filename of molecule and substrate
The first file should be molecule and 2nd for crystal
supported structure format: xsf .vasp POSCAR .nc .json .xyz ...
paramter format, i.e. :
mol.xyz POSCAR"""
            structs, fnames = read_structures(tips)
            mol = structs[0]
            mlog.info("read mol from %s" % (fnames[0]))
            mlog.info(mol)
            assert isinstance(mol, Molecule), "the first file should be molecule"
            cryst = structs[1]
            mlog.info("read crystal from %s" % (fnames[1]))
            mlog.info(cryst)
            assert isinstance(cryst, Structure), "the second file should be crystal"
            proc_adsorb(cryst, mol, data)
        return True
def submit_jobs(self,
                resources,
                command,
                work_path,
                tasks,
                group_size,
                forward_common_files,
                forward_task_files,
                backward_task_files,
                forward_task_deference = True,
                outlog = 'log',
                errlog = 'err') :
    """Split *tasks* into chunks and (re)submit each unfinished chunk.

    For every chunk not yet finished in the persistent JobRecord: reuse
    the recorded job uuid if the chunk was already submitted, build a
    communication context and batch handle, upload common and per-task
    files once (guarded by a '<uuid>_tag_upload' marker file on the
    remote side), then submit (or restart) the chunk.

    Parameters
    ----------
    resources : dict
        Batch resource description passed through to ``batch.submit``.
    command : str or list
        Command(s) to run in each task directory.
    work_path : str
        Local working directory holding the task dirs and the job record.
    tasks : list of str
        Task directories to run.
    group_size : int
        Number of tasks per chunk (see ``_split_tasks``).
    forward_common_files, forward_task_files, backward_task_files : list
        Files uploaded once per job / per task, and collected afterwards.
    forward_task_deference : bool
        Dereference symlinks on upload.
    outlog, errlog : str
        Names for stdout/stderr log files.

    Returns
    -------
    dict
        A job_handler consumed by ``all_finished``.
    """
    self.backward_task_files = backward_task_files
    # task_chunks = [
    #     [os.path.basename(j) for j in tasks[i:i + group_size]] \
    #     for i in range(0, len(tasks), group_size)
    # ]
    task_chunks = _split_tasks(tasks, group_size)
    # a chunk is identified by the sha1 of its '+'-joined task names
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_record = JobRecord(work_path, task_chunks, fname = self.jrname)
    nchunks = len(task_chunks)
    job_list = []
    for ii in range(nchunks) :
        cur_chunk = task_chunks[ii]
        cur_hash = task_hashes[ii]
        if not job_record.check_finished(cur_hash):
            # chunk is not finished
            # check if chunk is submitted; if so, recover its uuid
            submitted = job_record.check_submitted(cur_hash)
            if not submitted:
                job_uuid = None
            else :
                job_uuid = job_record.get_uuid(cur_hash)
                mlog.debug("load uuid %s for chunk %s" % (job_uuid, cur_hash))
            # communication context, batch system
            context = self.context(work_path, self.session, job_uuid)
            batch = self.batch(context, uuid_names = self.uuid_names)
            rjob = {'context':context, 'batch':batch}
            # upload files; the remote tag file makes the upload idempotent
            tag_upload = '%s_tag_upload' % rjob['context'].job_uuid
            if not rjob['context'].check_file_exists(tag_upload):
                rjob['context'].upload('.', forward_common_files)
                rjob['context'].upload(cur_chunk, forward_task_files, dereference = forward_task_deference)
                rjob['context'].write_file(tag_upload, '')
                mlog.debug('uploaded files for %s' % task_chunks_str[ii])
            # submit new or recover old submission
            if not submitted:
                rjob['batch'].submit(cur_chunk, command, res = resources, outlog=outlog, errlog=errlog)
                job_uuid = rjob['context'].job_uuid
                mlog.debug('assigned uuid %s for %s ' % (job_uuid, task_chunks_str[ii]))
                mlog.info('new submission of %s for chunk %s' % (job_uuid, cur_hash))
            else:
                rjob['batch'].submit(cur_chunk, command, res = resources,
                                     outlog=outlog, errlog=errlog, restart = True)
                mlog.info('restart from old submission %s for chunk %s' % (job_uuid, cur_hash))
            # record job and its remote context
            job_list.append(rjob)
            ip = None
            instance_id = None
            # ALI cloud profiles carry the remote host/instance explicitly
            if 'ali_auth' in self.remote_profile:
                ip = self.remote_profile['hostname']
                instance_id = self.remote_profile['instance_id']
            job_record.record_remote_context(cur_hash,
                                             context.local_root,
                                             context.remote_root,
                                             job_uuid,
                                             ip,
                                             instance_id)
        else :
            # finished job, append a None to list
            job_list.append(None)
    job_record.dump()
    assert(len(job_list) == nchunks)
    job_handler = {
        'task_chunks': task_chunks,
        'job_list': job_list,
        'job_record': job_record,
        'command': command,
        'resources': resources,
        'outlog': outlog,
        'errlog': errlog,
        'backward_task_files': backward_task_files
    }
    return job_handler