def sample_specific(self, done_files=None):
    """Fragment sample-specific sequence data for training.

    Scans SAMPLE_SPECIFIC_DIR for sequence files, maps each organism
    onto the taxonomy tree, and writes fixed-length fragments of the
    concatenated sequence for every configured fragment length into
    <project_dir>/sampled_fasta/<frag_len>/<filename>.

    :param done_files: optional list of already-processed files; unused
        by this method, kept for interface compatibility. (Was a mutable
        default argument ``[]`` — replaced with the ``None`` sentinel.)
    """
    if done_files is None:
        done_files = []
    my_log = logging.getLogger('train:sample_specific')

    # check if sample specific data is provided:
    if self.config.settings["sample_specific_dir"] == "":
        my_log.debug('no sample specific data')
        return
    if utils.dir_is_empty(self.config.settings["sample_specific_dir"]):
        my_log.debug('no sample specific data')
        return

    # NOTE(review): genomes_exclude_ss is never appended to anywhere in
    # this method; it only feeds the summary log at the end. Kept as-is
    # in case exclusion logic is added later.
    genomes_exclude_ss = []
    n_sequences_ss = 0
    my_log.info("Processing sample specific data (all data will be used)...")
    for e in os.listdir(self.config.settings["sample_specific_dir"]):
        # valid extension?
        if len(self.config.settings["extensions"]) != 0:
            ext = e.split(".")[-1]
            if ext == "" or ext not in self.config.settings["extensions"]:
                continue
        organism = e.split(".", 1)[0]
        if organism == "":
            my_log.warning("Invalid sample specific file: {} skipping..".format(e))
            continue
        # map organism onto the taxonomy tree
        if organism in self.nodes:
            node = organism
        else:
            node = self.get_mapped_organism(organism)
            if node is None:
                my_log.info("Could not map {} on the tree".format(organism))
                continue
            elif str(node) == "1":
                # node "1" is the tree root, i.e. no informative mapping
                my_log.info("Skipping {} due to lack of mapping".format(organism))
                continue
        seq_concat, definition = utils.get_sequence_infos(
            os.path.join(self.config.settings["sample_specific_dir"], e))
        for i in range(len(self.config.settings["fragment_len"])):
            fl = self.config.settings["fragment_len"][i]
            try:
                step = self.config.settings["sample_specific_step"][i]
            except IndexError:
                # should not happen at all
                step = fl
            if step == 0 or step is None:
                step = fl
            if len(seq_concat) < fl:
                my_log.debug("No sample specific data for organism {o} at frag_len {fl}".format(o=organism, fl=fl))
                continue
            # BUGFIX: use floor division. True division yields a float
            # under Python 3 and range(float) raises TypeError.
            number_frags = (len(seq_concat) - fl) // step
            sample_dir = os.path.join(self.config.settings["project_dir"], "sampled_fasta")
            fl_dir = os.path.join(sample_dir, str(fl))
            fastafile = os.path.join(fl_dir, e)
            # NOTE(review): assumes utils.write_fragments creates fl_dir
            # if missing — confirm, otherwise makedirs is needed here.
            utils.write_fragments(fl, step, seq_concat, definition, node, fastafile, range(number_frags))
            # if self.stat is not None:  # this is very inefficient at the moment !!!
            #     self.stat.succesfully_written(fastafile)
            #     self.stat.write_backup(self.backupdir)
        n_sequences_ss += 1

    if n_sequences_ss == 0:
        my_log.error("no data processed in SAMPLE_SPECIFIC_DIR")
    if len(genomes_exclude_ss) != 0:
        my_log.info("(excluded {} genomes from SS)".format(str(len(genomes_exclude_ss))))
    my_log.info("{} SS sequences done.".format(str(n_sequences_ss)))
async def main(loop):
    """Download, extract, converge, and install the full Oblivion mod list.

    Pipeline stages (each wrapped in its own Timer for the summary log at
    the end): download -> extract -> preprocess -> converge -> apply ->
    purge -> postprocess -> load order (BOSS) -> enable plugins.

    :param loop: asyncio event loop, passed through to aiohttp sessions.
    """
    # One timer per pipeline stage; all are reported at the end.
    download_timer = Timer()
    extract_timer = Timer()
    mod_converge_timer = Timer()
    apply_converge_timer = Timer()
    purge_timer = Timer()
    preproc_timer = Timer()
    postproc_timer = Timer()
    load_order_timer = Timer()
    enable_plugins_timer = Timer()
    # Install order matters: earlier mods are overwritten by later ones
    # during convergence (last writer wins per destination path).
    mod_list = [
        # Install outside of Data/
        FastExit(),
        FourGBPatch(),
        OBSE(),
        ENB(),
        ENBoost(),
        MoreHeap(),
        # OBSE Plugins
        OneTweak(),
        # OBSETester(),
        MenuQue(),
        ConScribe(),
        Pluggy(),
        NifSE(),
        # Performance
        Streamline(),
        OSR(),
        # Necessary Tweaks
        ATakeAllAlsoBooks(),
        DarnifiedUI(),
        DarnifiedUIConfigAddon(),
        HarvestFlora(),
        HarvestContainers(),
        # Textures
        QTP3R(),
        GraphicImprovementProject(),
        ZiraHorseCompilationModpack(),
        RingRetexture(),
        KafeisArmoredCirclets(),
        KoldornsSewerTextures2(),
        KoldornsCaveTextures2(),
        MEAT(),
        BomretTexturePackForShiveringIslesWithUSIP(),
        # Gameplay
        RealisticLeveling(),
        HUDStatusBars(),
        UV3(),
        # Install Last
        INITweaks(),
        ArchiveInvalidationInvalidated(),
    ]
    # Maps lowercased relative dest path -> absolute source file.
    # Seeded with the vanilla game files so mods override them.
    converged_paths = {}
    path_mod_owner = {}  # which mod owns which path
    for path in recurse_files(Config.VANILLA_DIR):
        converged_paths[str(path).lower()] = Config.VANILLA_DIR / path
    log.info('downloading')
    with download_timer:
        for mod in mod_list:
            # NOTE(review): a fresh ClientSession per mod — sessions are
            # normally reused; presumably deliberate here, confirm.
            async with aiohttp.ClientSession(loop=loop) as session:
                await mod.download(session)
    if False:  # stop after download?
        log.info('stopping after download')
        return
    log.info('extracting')
    with extract_timer:
        for mod in mod_list:
            await mod.extract()
    if False:  # stop after extract?
        log.info('stopping after extract')
        return
    log.info('pre-processing')
    with preproc_timer:
        for mod in mod_list:
            await mod.preprocess()
    log.info('calulcating convergance for each mod')
    with mod_converge_timer:
        for mod in mod_list:
            log.info(f'converging {mod.mod_name}')
            # Each mod yields (absolute source file, relative dest path);
            # validate both sides before recording ownership.
            for source_path, dest_path in mod.modify():
                if not isinstance(dest_path, Path):
                    raise Exception(f'{dest_path} is not a Path!')
                elif dest_path.is_absolute():
                    raise Exception(f'{dest_path} is not absolute')
                if not isinstance(source_path, Path):
                    raise Exception(f'{source_path} is not a Path!')
                elif not source_path.is_absolute():
                    raise Exception(f'{source_path} is not absolute')
                elif not source_path.exists():
                    raise Exception(f'{source_path} does not exist, bad modify code')
                elif not source_path.is_file():
                    raise Exception(f'{source_path} is not a regular file.')
                # Later mods in mod_list overwrite earlier owners here.
                converged_paths[str(dest_path).lower()] = source_path
                path_mod_owner[str(dest_path).lower()] = mod.mod_name
    log.info('applying convergance')
    with apply_converge_timer:
        for dest_path, source_path in converged_paths.items():
            dest_path = Config.game.root_dir / dest_path
            # Re-link only when the dest is missing or points elsewhere.
            if not dest_path.exists() or not samefile(str(dest_path), str(source_path)):
                if dest_path.exists():
                    dest_path.unlink()  # FIXME move to purged dir?
                dest_path.parent.mkdir(exist_ok=True, parents=True)
                try:
                    create_hardlink(str(source_path), str(dest_path))
                except FileNotFoundError:
                    raise Exception(f'failed to hard link {source_path} to {dest_path} {source_path} (or {dest_path.parent}) not found')
    log.info('purging')
    with purge_timer:
        # Ownership map from the previous run: files we know we created
        # can be deleted outright; unknown files are moved aside instead.
        try:
            with Config.mod_ownership_path.open('rb') as f:
                old_path_mod_owner = json.load(f)
        except FileNotFoundError:
            old_path_mod_owner = {}
        purged_root = Config.PURGED_DIR / datetime.now().isoformat().replace(':', '')
        for path in recurse_files(Config.game.root_dir):
            if (
                str(path).lower() not in converged_paths and
                not path.suffix.lower() in {'.ini', '.cfg', '.json', '.log'} and
                #TODO don't purge xml files unless they're menu files
                not path.parts[0].lower() in {'obmm', 'mopy'}
            ):
                if str(path).lower() in old_path_mod_owner:
                    (Config.game.root_dir / path).unlink()
                else:
                    # Not ours: preserve it under a timestamped purge dir.
                    purged_path = purged_root / path
                    purged_path.parent.mkdir(exist_ok=True, parents=True)
                    (Config.game.root_dir / path).rename(purged_path)
        log.info('purging empty directories')
        for d in recurse_dirs(Config.game.root_dir):
            if dir_is_empty(d):
                d.rmdir()
    log.info('postprocessing')
    with postproc_timer:
        for mod in mod_list:
            await mod.postprocess()
    log.info('Done Applying Changes')
    log.info('modifying load order')
    with load_order_timer:
        # Locate BOSS via its uninstall registry entry and run it to
        # sort the plugin load order.
        boss_uninstall_string = get_regkey('HKLM', r'SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\BOSS', 'UninstallString')
        boss_install_location = Path(shlex.split(boss_uninstall_string)[0]).parent
        boss_exe_path = boss_install_location / 'boss.exe'
        proc = await asyncio.create_subprocess_exec(
            str(boss_exe_path),
            '-s', '-g', Config.game.BOSS_NAME,
            cwd=str(boss_install_location),
            stderr=sys.stderr,
            stdout=sys.stdout,
        )
        await proc.wait()
    log.info('enabling all .esp and .esm files')
    with enable_plugins_timer:
        PLUGINS_HEADER = textwrap.dedent('''
        # This file is used to tell Oblivion which data files to load.
        # WRITE YOUR OWN PYTHON SCRIPT TO MODIFY THIS FILE (lol)
        # Please do not modify this file by hand.
        ''').strip()
        # Rewrite plugins.txt atomically: header, then all .esm, then all .esp.
        with atomic_save(str(Config.game.app_data_path / 'plugins.txt')) as f:
            with io.TextIOWrapper(f, 'ascii') as ef:
                ef.write(PLUGINS_HEADER)
                ef.write('\n')
                for esm in Config.game.root_dir.glob('Data/*.esm'):
                    ef.write(esm.name)
                    ef.write('\n')
                for esp in Config.game.root_dir.glob('Data/*.esp'):
                    ef.write(esp.name)
                    ef.write('\n')
    log.info('saving which mod owns which file')
    # Persist the ownership map for the purge step of the next run.
    with atomic_save(str(Config.mod_ownership_path)) as f:
        with io.TextIOWrapper(f, 'ascii') as ef:
            json.dump(path_mod_owner, ef)
    log.info(f'download_timer = {download_timer}')
    log.info(f'extract_timer = {extract_timer}')
    log.info(f'mod_converge_timer = {mod_converge_timer}')
    log.info(f'apply_converge_timer = {apply_converge_timer}')
    log.info(f'purge_timer = {purge_timer}')
    log.info(f'preproc_timer = {preproc_timer}')
    log.info(f'postproc_timer = {postproc_timer}')
    log.info(f'load_order_timer = {load_order_timer}')
    log.info(f'enable_plugins_timer = {enable_plugins_timer}')
def process_ncbi(self):
    """Download NCBI sequences, parse gbk files to fasta format,
    rename sequences and sort per genome.

    If NCBI_PROCESSED_DIR is empty, offers to download the Bacteria
    archive from NCBI (exits with status 1 on refusal or any download/
    unpack/processing failure). Afterwards scans the processed dir and
    populates self.organisms and self.organism_file_map, honoring
    self.config.genomes_exclude.
    """
    my_log = logging.getLogger('train:process_ncbi')
    if utils.dir_is_empty(self.config.settings["ncbi_processed_dir"]):
        my_log.warning("The NCBI_PROCESSED_DIR is empty, it is possible to download data from NCBI.")
        my_log.info("I can automatically download Bacterial & Archael data "
                    "(for more possibilities see INSTALL.txt).")
        if self.yes:
            answ = "y"
        else:
            my_log.info("Download sequence data from NCBI? [Y/N] (default=Y, timeout 2 minutes)")
            answ = utils.get_answer_timeout()
        if answ != "y":
            error_message = "There is no training data available, provide some and run the program again." \
                            "Read INSTALL.txt for details on how this can be done."
            my_log.critical(error_message)
            sys.exit(1)
        my_log.info("Download may take some time ...")
        # Create the tmp dir once; reuse the path for download + unpack.
        tmp_dir = os.path.join(self.config.settings["project_dir"], 'tmp')
        os.mkdir(tmp_dir)
        data_archive = os.path.join(tmp_dir, 'all.gbk.tar.gz')
        success = os.system("wget -O {} ftp://ftp.ncbi.nih.gov/genomes/Bacteria/all.gbk.tar.gz".format(data_archive))
        if success != 0:
            my_log.critical("Error in downloading sequence data from NCBI.")
            sys.exit(1)
        # unpack data
        unpack_cmd = "tar xfz {a} -C {tmp}".format(a=data_archive, tmp=tmp_dir)
        success = os.system(unpack_cmd)
        if success != 0:
            my_log.critical("Error in unpacking the downloaded sequence data.")
            sys.exit(1)
        # process the data and create the fasta files in ncbi_dir
        process_object = process_ncbi.Process_NCBI(tmp_dir, self.config.settings["ncbi_processed_dir"])
        success = process_object.run()
        if not success:
            sys.exit(1)
        # clean the downloaded NCBI data
        shutil.rmtree(tmp_dir)

    # get all the organism names from the files in the ncbi_dir
    # this can be used for generating generic clades
    n_sequences = 0
    extensions = self.config.settings["extensions"]
    # Filtering is active only when extensions is a real list of
    # suffixes; the "no filter" configurations are [] or [""].
    # BUGFIX: the original condition used `or`, which made the second
    # clause dead and caused every file to be skipped when
    # extensions == [""].
    filter_active = len(extensions) > 0 and not (len(extensions) == 1 and extensions[0] == "")
    files = glob.glob("{dir}{sep}*.*".format(dir=self.config.settings["ncbi_processed_dir"], sep=os.path.sep))
    for f in files:
        ext = f.split(".")[-1]
        if filter_active and ext not in extensions:
            continue
        if "." not in f:
            my_log.debug("Invalid file: {}..skipping".format(f))
            continue
        organism = f.split(os.path.sep)[-1].split(".")[0]
        # exclude this genome if asked to
        if organism in self.config.genomes_exclude:
            self.genomes_excluded.append(organism)
            continue
        n_sequences += 1
        self.organisms.add(organism)
        # BUGFIX: the original called .append() on the dict itself
        # (AttributeError); append to the organism's file list instead.
        if organism not in self.organism_file_map:
            self.organism_file_map[organism] = [f]
        else:
            self.organism_file_map[organism].append(f)
    # BUGFIX: was `is not 0` — identity comparison with an int literal.
    if len(self.genomes_excluded) != 0:
        my_log.info("excluded {} sequences.".format(str(len(self.genomes_excluded))))