def createBlastDB(self, name):
    '''wrapper for the 'makeblastdb' script of the BLAST software suite'''
    # create output folder
    create_folder(self.DB_OUT)
    # select all downloaded folders as input for multifasta file creation
    subfolder = '.'
    # create multifasta file as input for 'makeblastdb'
    multi_fasta = self.createBlast_MF(subfolder)
    try:
        sys.stdout.write("Create BlastDB ...\n")
        # run 'makeblastdb'
        p = subprocess.Popen(shlex.split("%s -in %s -dbtype %s -input_type %s -title %s -out %s %s" %
                                         (self.EXECUTABLE, multi_fasta, self.DB_TYPE, 'fasta', name,
                                          self.DB_OUT + os.sep + name, self.get_parse_seqids_stmt())),
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # communicate() drains the pipes and avoids the deadlock that p.wait()
        # can cause when the PIPE buffer fills up
        stdout, stderr = p.communicate()
        # print statistics
        sys.stdout.write("Creation of %s BlastDB successful!%s\nDatabase Location: %s\n" %
                         (self.DB_TYPE, stdout, self.DB_OUT + os.sep + name))
    except Exception:
        raise BlastDBException()
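# Hedged sketch, not part of the original module: one plausible implementation of the
# get_parse_seqids_stmt() helper called above, assuming it merely toggles the
# '-parse_seqids' flag on the makeblastdb command line based on self.PARSE_SEQIDS.
def get_parse_seqids_stmt(self):
    '''return the '-parse_seqids' flag if GI parsing is enabled, else an empty string'''
    return '-parse_seqids' if self.PARSE_SEQIDS else ''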
def createMetaCVDB(self, name, subfolder):
    '''wrapper for the 'metacv formatdb' script to generate a MetaCV database'''
    # check additional files and get location
    taxonomy = self.get_taxonomy()
    # functional = self.get_functional_annotation()
    create_folder(self.DB_OUT)
    # generate multifasta file
    multi_fasta = self.createMetaCV_MF(subfolder)
    # an absolute path is needed because the external script runs after os.chdir
    full_path_exe = os.path.abspath(self.EXECUTABLE)
    try:
        sys.stdout.write("Create MetaCV DB ...\n")
        # metacv cannot pipe its output to another folder, so it
        # has to be run in the same folder as the output
        os.chdir(self.DB_OUT)
        # start metacv formatdb with standard parameters
        p = subprocess.Popen(shlex.split("%s formatdb %s %s %s" %
                                         (full_path_exe, '../' + multi_fasta,
                                          ' '.join(map(str, taxonomy)), name)))
        p.wait()
        # print statistics
        sys.stdout.write("Creation of MetaCV DB successful!\nDatabase Location: %s\n" %
                         (self.DB_OUT + os.sep + name))
    except Exception:
        raise MetaCVException()
def assert_apply_paths(self):
    self.checkpoints = os.path.join(
        self.args.checkpoints, '_'.join([
            self.nengine_name, self.args.db_regression_name,
            'v' + '.'.join(self.nengine_version), self.args.postfix
        ]))
    if not os.path.exists(self.checkpoints):
        raise Exception(
            'base_runner.py: class Runner: def assert_apply_paths(...): error: '
            'checkpoints directory {} does not exist.'.format(self.checkpoints))
    self.checkpoints_restore = os.path.join(
        self.checkpoints,
        self.args.restore_checkpoint_template.format(self.restore_epoch,
                                                     self.global_step))
    self.checkpoints_config = os.path.join(self.checkpoints, 'config')
    target_list = os.path.splitext(os.path.split(self.apply_list)[-1])[0]
    self.apply_savedir = os.path.join(
        self.args.apply_savedir, target_list + '_' + '_'.join([
            self.nengine_name, self.args.db_regression_name,
            'v' + '.'.join(self.nengine_version), self.args.postfix
        ]))
    utils.create_folder(self.checkpoints_config, force=False,
                        raise_except_if_exists=False)
    utils.create_folder(self.apply_savedir, force=self.args.apply_force)
def save_model(self, mlflow, path):
    path = os.path.join(path, 'saved_models', f'{self._model_name}_{self._run_time}')
    create_folder(path)
    self._save_tokenizer(path)
    self._save_model(mlflow, path)
    self._save_embeddings(path)
def train(self):
    # preprocess data
    print('preprocessing data')
    self._preprocess_data()
    # get embeddings
    print('getting embeddings weights')
    self._get_embeddings()
    # create model architecture
    self._create_model()
    summary = self.get_summary()
    print('model summary:', summary)
    self._save_path = os.path.join(self._save_path, 'checkpoints',
                                   f'{self._model_name}_{self._run_time}')
    create_folder(self._save_path)
    cp_callback = ModelCheckpoint(filepath=self._save_path,
                                  save_weights_only=False,
                                  # verbose=verbose,
                                  save_best_only=True,
                                  monitor='val_loss',
                                  mode='min')
    self._history = self._model.fit(self._X_train,
                                    self._y_train,
                                    epochs=self._epochs,
                                    validation_data=self._validation_data,
                                    validation_split=self._validation_split,
                                    batch_size=self._batch_size,
                                    callbacks=[cp_callback])
def get_save_dir(args):
    if (args.debug or args.evaluate or args.extract or args.visualization
            or args.generate_data or args.generate_label):
        save_dir = '{}/temp'.format(args.work_dir)
    else:
        ct = strftime('%Y-%m-%d %H-%M-%S')
        save_dir = '{}/{}_{}_{}/{}'.format(args.work_dir, args.config,
                                           args.model_type, args.dataset, ct)
    U.create_folder(save_dir)
    return save_dir
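# Hedged usage sketch for get_save_dir(); the argument values are illustrative only.
# With args.work_dir='work', args.config='cfg01', args.model_type='resnet',
# args.dataset='ntu' and none of the debug/evaluate/extract/visualization flags set,
# the call creates and returns a folder such as 'work/cfg01_resnet_ntu/2024-01-31 12-00-00';
# with any of those flags set, it returns 'work/temp' instead.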
def main(argv=None):
    # Setup argument parser
    parser = ArgumentParser(description='%s -- create actual bacteria databases from NCBI sources' %
                            (os.path.basename(sys.argv[0])),
                            epilog='created by Philipp Sehnert',
                            add_help=True)
    parser.add_argument('--version', action='version',
                        version='%s 1.0' % (os.path.basename(sys.argv[0])))
    parser.add_argument('-type', dest='type', default='nucl', choices={'nucl', 'prot'},
                        help='set type of blastdb')
    parser.add_argument('-metacv', dest='metacv', action='store_true', default=False,
                        help='create metacv database')
    parser.add_argument('-exe', dest='exe',
                        help="if not installed, specify path to executable of 'makeblastdb' or 'metacv'")
    parser.add_argument('-name', dest='name', default='bacterial', required=True,
                        help='outname for the databases')
    parser.add_argument('-parse_seqids', dest='parse_seqids', action='store_false', default=True,
                        help='remove duplicated GI numbers from downloaded files and run '
                             '"makeblastdb" with the -parse_seqids statement')
    # Process arguments
    args = parser.parse_args()
    DB_TYPE = args.type
    METACV = args.metacv
    DB_NAME = args.name
    EXECUTABLE = args.exe
    PARSE_SEQIDS = args.parse_seqids
    # check for protein or nucleotide database
    DB_TYPE = check_db_type(METACV, DB_TYPE)
    # verify executable for external scripts
    EXECUTABLE = check_executable(EXECUTABLE, METACV)
    # create dir for sources
    create_folder(DOWNLOAD_FOLDER)
    # init FTP functions
    ftp = ftp_functions(FTP_SERVER, FTP_ROOT, DOWNLOAD_FOLDER, DEBUG)
    # connect to Blast FTP server
    ftp.connect()
    ftp.go_to_root()
    # start downloading
    for ftp_folder in SOURCES:
        sys.stdout.write("Downloading files from %s \n" % (ftp_folder))
        ftp.download_folder(ftp_folder, DB_TYPE)
    # close ftp connection
    ftp.close()
    # run external database creation scripts
    DBCreate = DBCreation(DB_OUT, DOWNLOAD_FOLDER, DB_TYPE, PARSE_SEQIDS, DEBUG, EXECUTABLE)
    if METACV:
        DBCreate.set_METACV(True)
        # select the subfolders for the MetaCV database
        DBCreate.createMetaCVDB(DB_NAME, ['Bacteria', 'Bacteria_DRAFT'])
    else:
        DBCreate.set_METACV(False)
        DBCreate.createBlastDB(DB_NAME)

if __name__ == '__main__':
    main()
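# Hedged CLI sketch; the flag spellings come from the parser above, but the script
# name 'create_db.py' is a placeholder, not confirmed by the source:
#   python create_db.py -name bacterial -type nucl
#   python create_db.py -name bacterial -metacv -exe /opt/metacv/metacv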
def download_folder(self, remote_folder, db_type):
    '''download a folder with all subfolders and matching files from the ftp site'''
    local = self.DOWNLOAD_FOLDER + os.sep + remote_folder
    # go to remote dir
    self.go_down(remote_folder)
    # get list of subfolders
    folder_list = self.get_folder_index(remote_folder)
    # counters for cmd output ('total_files' avoids shadowing the builtin 'all')
    downloaded = actual = total_files = 0
    # init progressbar
    total = float(len(folder_list))
    count = float(downloaded + actual)
    update_progress(count)
    # loop over folders in remote_dir
    for item in folder_list:
        # create local folder
        local_folder = local + os.sep + item
        create_folder(local_folder)
        # go down in ftp file structure and get a list of matching files
        self.go_down(item)
        file_list = self.get_file_index(db_type)
        # update cmd values
        total_files += len(file_list)
        # update progressbar
        count += float(1 / total)
        update_progress(count)
        # loop over files
        for x in file_list:
            # compare timestamps and only download outdated or missing files
            if not self.is_actual(local_folder, x):
                self.download_file(local_folder, x)
                downloaded += 1
            else:
                actual += 1
        # go up in the file structure
        self.go_up()
    # write status information to stdout
    sys.stdout.write("\nDownloaded: %d Actual: %d Total: %d \n\n" %
                     (downloaded, actual, total_files))
    # go up in file structure to root dir
    self.go_up()
def assert_to_proto_paths(self):
    self.checkpoints = os.path.join(
        self.args.checkpoints, '_'.join([
            self.nengine_name, self.args.db_regression_name,
            'v' + '.'.join(self.nengine_version), self.args.postfix
        ]))
    if not os.path.exists(self.checkpoints):
        raise Exception(
            'base_runner.py: class Runner: def assert_to_proto_paths(...): error: '
            'checkpoints directory {} does not exist.'.format(self.checkpoints))
    self.checkpoints_restore = os.path.join(
        self.checkpoints,
        self.args.restore_checkpoint_template.format(self.restore_epoch,
                                                     self.global_step))
    self.checkpoints_config = os.path.join(self.checkpoints, 'config')
    self.checkpoints_to_proto = os.path.join(self.checkpoints, 'pb_model')
    utils.create_folder(self.checkpoints_config, force=False,
                        raise_except_if_exists=False)
    utils.create_folder(self.checkpoints_to_proto, force=False)
def save_feature_importance(self, method, preds_test=None):
    # preds_test is currently unused; None replaces a mutable default list
    logger.info(method)
    path = os.path.join('reports/figures/local/',
                        (self.model.get_name() + '_' + self.feature_set))
    create_folder(path)
    filename = self._get_figure_file_name(method, path)
    explainer = shap.TreeExplainer(self.model.get_model(reinitialize=False))
    shap_values = explainer.shap_values(self.X_train)
    if method == 'shap':
        # shap_values = shap_values[:, :-1]
        shap.summary_plot(shap_values, self.X_train, show=False)
        plt.savefig(filename, bbox_inches='tight')
    elif method == 'summary':
        shap.summary_plot(shap_values, self.X_train, plot_type='bar', show=False)
        plt.savefig(filename, bbox_inches='tight')
    elif method == 'feature_importance':
        feature_score = pd.DataFrame(list(zip(self.X_test.dtypes.index,
                                              shap_values.mean(axis=0))),
                                     columns=['Feature', 'Score'])
        feature_score = feature_score.sort_values(by='Score', ascending=False,
                                                  inplace=False, kind='quicksort',
                                                  na_position='last')
        plt.rcParams['figure.figsize'] = (12, 7)
        ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
        ax.set_title('Feature Importance using {}'.format(method), fontsize=14)
        ax.set_xlabel('features')
        plt.savefig(filename, bbox_inches='tight')
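# Hedged usage sketch; 'evaluator' stands for whatever object owns
# save_feature_importance() and must already provide self.model, self.feature_set,
# self.X_train and self.X_test:
#   evaluator.save_feature_importance('shap')
#   evaluator.save_feature_importance('summary')
#   evaluator.save_feature_importance('feature_importance')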
def assert_train_paths(self):
    self.checkpoints = os.path.join(
        self.args.checkpoints, '_'.join([
            self.nengine_name, self.args.db_regression_name,
            'v' + '.'.join(self.nengine_version), self.args.postfix
        ]))
    self.checkpoints_config = os.path.join(self.checkpoints, 'config')
    self.checkpoints_logs = os.path.join(self.checkpoints, 'logs')
    self.checkpoints_logs_train = os.path.join(self.checkpoints, 'logs', 'train')
    if self.val_list != '':
        self.checkpoints_logs_val = os.path.join(self.checkpoints, 'logs', 'val')
    # [optional] Training restoration
    if (self.args.restore_epoch is not None) and (self.args.restore_step is not None):
        self.checkpoints_restore = os.path.join(
            self.checkpoints,
            self.args.restore_checkpoint_template.format(self.args.restore_epoch,
                                                         self.args.restore_step))
        create_folder_kwargs = dict(force=False, raise_except_if_exists=False)
        if self.args.train_force:
            print('[WARNING] restore_epoch and restore_step are set, but train_force is True. '
                  'Training directories will not be cleaned and training will be restored. '
                  'To train from scratch, do not set the restore_epoch and restore_step arguments.')
    else:
        self.checkpoints_restore = None
        create_folder_kwargs = dict(force=self.args.train_force)
    utils.create_folder(self.checkpoints, **create_folder_kwargs)
    utils.create_folder(self.checkpoints_config, **create_folder_kwargs)
    utils.create_folder(self.checkpoints_logs, **create_folder_kwargs)
    utils.create_folder(self.checkpoints_logs_train, **create_folder_kwargs)
    if self.val_list != '':
        utils.create_folder(self.checkpoints_logs_val, **create_folder_kwargs)
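# Hedged sketch of the checkpoint layout assert_train_paths() prepares, derived only
# from the os.path.join calls above (the name components are illustrative):
#
#   <checkpoints>/<nengine_name>_<db_regression_name>_v<x.y>_<postfix>/
#       config/
#       logs/
#           train/
#           val/    (only when val_list is non-empty)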
def create_temporal_folders(self):
    utils.create_folder(lambda_instance.temporal_folder)
    utils.create_folder(lambda_instance.input_folder)
    utils.create_folder(lambda_instance.output_folder)
def setUp(self):
    self.year = 1913
    create_folder('./data/{}/posters'.format(self.year))
    create_folder('./data/{}/thumbnails'.format(self.year))
    self.dict_imgs_1913 = get_yearly_url_imgs(self.year)
def test_create_folder(self):
    folder_tmp = './dummy-folder'
    create_folder(folder_tmp)
    self.assertTrue(os.path.exists(folder_tmp))
    os.rmdir(folder_tmp)
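# Hedged sketch of a create_folder() helper consistent with the call sites above; the
# 'force' and 'raise_except_if_exists' semantics are assumptions inferred from the
# utils.create_folder(...) calls, not the repo's real implementation.
import os
import shutil

def create_folder(path, force=False, raise_except_if_exists=False):
    '''create path (including parents); optionally recreate it from scratch'''
    if os.path.exists(path):
        if force:
            # wipe the existing folder so it is recreated empty below
            shutil.rmtree(path)
        elif raise_except_if_exists:
            raise OSError('folder already exists: %s' % path)
        else:
            # keep the existing folder untouched
            return
    os.makedirs(path)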
def _save(self, data, path, file_name):
    create_folder(path)
    print('saving interim dataset')
    data.to_csv(os.path.join(path, file_name), index=False)