Example #1
 def createBlastDB(self, name):
     ''' wrapper for the 'makeblastdb' script of the BLAST software suite'''
     # create output folder
     create_folder(self.DB_OUT)
     # select all downloaded folders as input for multifasta file creation
     subfolder = '.'
     # create multifasta file as input for 'makeblastdb'
     multi_fasta = self.createBlast_MF(subfolder)
     try: 
         sys.stdout.write("Create BlastDB ...\n")
         # run 'makeblastdb'
         p = subprocess.Popen(shlex.split("%s -in %s -dbtype %s -input_type %s -title %s -out %s %s" 
                                 % (self.EXECUTABLE, 
                                     multi_fasta, 
                                     self.DB_TYPE, 
                                     'fasta', 
                                     name, 
                                     self.DB_OUT + os.sep + name, 
                                     self.get_parse_seqids_stmt())),
                                 stdout = subprocess.PIPE, stderr = subprocess.PIPE)
         p.wait()
         # print statistics 
         sys.stdout.write("Creation of %s BlastDB successfull!%s\nDatabase Location: %s\n" % 
             (self.DB_TYPE, p.stdout.read(), self.DB_OUT + os.sep + name))
     except: 
         raise BlastDBException()
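
For reference, the command assembled above resembles the following; this is only an illustrative sketch with hypothetical values standing in for self.EXECUTABLE, multi_fasta, self.DB_TYPE, name and self.DB_OUT, and it assumes get_parse_seqids_stmt() returns '-parse_seqids' as suggested by the help text in Example #7:

import shlex

# hypothetical stand-ins for the instance attributes used in createBlastDB
cmd = ("makeblastdb -in db_out/multi.fasta -dbtype nucl -input_type fasta "
       "-title bacterial -out db_out/bacterial -parse_seqids")
print(shlex.split(cmd))  # the token list that subprocess.Popen receives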
Example #2
    def createMetaCVDB(self, name, subfolder):     
        '''wrapper for "metacv formatdb" script to generate a MetaCV database'''
        # check additional_files and get location
        taxonomy = self.get_taxonomy()
        #functional = self.get_functional_annotation()

        create_folder(self.DB_OUT)
        # generate multifasta file
        multi_fasta = self.createMetaCV_MF(subfolder)
        # needed to run the external script
        full_path_exe = os.path.abspath(self.EXECUTABLE)
        try:
            sys.stdout.write("Create MetaCV DB ...\n")
            # metacv cannot redirect its output to another folder, so it
            # has to be run in the same folder as the output
            os.chdir(self.DB_OUT)
            # start metacv formatdb with standard parameters
            p = subprocess.Popen(shlex.split("%s formatdb %s %s %s" 
                                            % (full_path_exe,
                                            '../' + (multi_fasta),
                                            ' '.join(map(str,taxonomy)), 
                                            name)))
            p.wait()
            # print statistics 
            sys.stdout.write("Creation of MetaCV DB successfull!\nDatabase Location: %s\n" % 
                (self.DB_OUT + os.sep + name))
        except:
            raise MetaCVException()
Example #3
    def assert_apply_paths(self):
        self.checkpoints = os.path.join(
            self.args.checkpoints, '_'.join([
                self.nengine_name, self.args.db_regression_name,
                'v' + '.'.join(self.nengine_version), self.args.postfix
            ]))
        if not os.path.exists(self.checkpoints):
            raise Exception(
                'base_runner.py: class Runner: def assert_apply_paths(...): error: checkpoints directory {} does not exist.'
                .format(self.checkpoints))

        self.checkpoints_restore = os.path.join(
            self.checkpoints,
            self.args.restore_checkpoint_template.format(
                self.restore_epoch, self.global_step))
        self.checkpoints_config = os.path.join(self.checkpoints, 'config')

        target_list = os.path.splitext(os.path.split(self.apply_list)[-1])[0]
        self.apply_savedir = os.path.join(
            self.args.apply_savedir, target_list + '_' + '_'.join([
                self.nengine_name, self.args.db_regression_name,
                'v' + '.'.join(self.nengine_version), self.args.postfix
            ]))

        utils.create_folder(self.checkpoints_config,
                            force=False,
                            raise_except_if_exists=False)
        utils.create_folder(self.apply_savedir, force=self.args.apply_force)
Example #4
 def save_model(self, mlflow, path):
     path = os.path.join(path, 'saved_models',
                         f'{self._model_name}_{self._run_time}')
     create_folder(path)
     self._save_tokenizer(path)
     self._save_model(mlflow, path)
     self._save_embeddings(path)
Example #5
    def train(self):

        # preprocess data
        print('preprocessing data')
        self._preprocess_data()

        # get embeddings
        print('getting embeddings weights')
        self._get_embeddings()

        # create model architecture
        self._create_model()
        summary = self.get_summary()
        print('model summary:', summary)

        self._save_path = os.path.join(self._save_path, 'checkpoints',
                                       f'{self._model_name}_{self._run_time}')
        create_folder(self._save_path)

        cp_callback = ModelCheckpoint(
            filepath=self._save_path,
            save_weights_only=False,
            # verbose=verbose,
            save_best_only=True,
            monitor='val_loss',
            mode='min')

        self._history = self._model.fit(
            self._X_train,
            self._y_train,
            epochs=self._epochs,
            validation_data=self._validation_data,
            validation_split=self._validation_split,
            batch_size=self._batch_size,
            callbacks=[cp_callback])
Example #6
def get_save_dir(args):
    if args.debug or args.evaluate or args.extract or args.visualization or args.generate_data or args.generate_label:
        save_dir = '{}/temp'.format(args.work_dir)
    else:
        ct = strftime('%Y-%m-%d %H-%M-%S')
        save_dir = '{}/{}_{}_{}/{}'.format(args.work_dir, args.config,
                                           args.model_type, args.dataset, ct)
    U.create_folder(save_dir)
    return save_dir
Example #7
def main(argv = None):

    # Setup argument parser
    parser = ArgumentParser(description = '%s -- create up-to-date bacterial databases from NCBI sources' % 
                            (os.path.basename(sys.argv[0])),
                            epilog = 'created by Philipp Sehnert',
                            add_help = True)
    parser.add_argument('--version', action = 'version', version = '%s 1.0' % 
                        (os.path.basename(sys.argv[0])))
    parser.add_argument("-type", dest = "type", default = 'nucl', 
                        choices = {'nucl','prot'},  help = "set type of blastdb")
    parser.add_argument('-metacv', dest = 'metacv', action = 'store_true',
                        default = False, help = 'create metacv database')
    parser.add_argument('-exe', dest = 'exe', 
                        help = "if not installed, specify path to executable of 'makeblastdb' or 'metacv'")
    parser.add_argument('-name', dest = 'name', default = 'bacterial', required = True,
                        help = 'outname for the databases')
    parser.add_argument('-parse_seqids', dest = 'parse_seqids', action = 'store_false', default = True,
                        help = 'Remove duplicated GI numbers from downloaded files and run "makeblastdb" with -parse_seqids statement ')
    # Process arguments
    args = parser.parse_args()
    DB_TYPE = args.type
    METACV = args.metacv
    DB_NAME = args.name  
    EXECUTABLE = args.exe
    PARSE_SEQIDS = args.parse_seqids
    
    if __name__ == '__main__':
        # check for protein or nucleotide database
        DB_TYPE = check_db_type(METACV, DB_TYPE)
        # verify executable for external scripts
        EXECUTABLE = check_executable(EXECUTABLE, METACV)
        # create dir for sources
        create_folder(DOWNLOAD_FOLDER)
        # init FTP functions
        ftp = ftp_functions(FTP_SERVER, FTP_ROOT, DOWNLOAD_FOLDER, DEBUG)
        # connect to Blast FTP Server 
        ftp.connect()
        ftp.go_to_root()
        # start Downloading
        for ftp_folder in SOURCES:
            sys.stdout.write("Downloading files from %s \n" % (ftp_folder))
            ftp.download_folder(ftp_folder, DB_TYPE)
        # close ftp connection
        ftp.close()
        # run external database creation scripts
        DBCreate = DBCreation(DB_OUT, DOWNLOAD_FOLDER, DB_TYPE, PARSE_SEQIDS, DEBUG, EXECUTABLE)
        if METACV:
            DBCreate.set_METACV(True)
            # select the subfolder for MetaCV database
            DBCreate.createMetaCVDB(DB_NAME, ['Bacteria', 'Bacteria_DRAFT'])
        else:
            DBCreate.set_METACV(False)
            DBCreate.createBlastDB(DB_NAME)     
Example #8
 def download_folder(self, remote_folder, db_type):
     '''download a folder with all subfolders and matching files from ftp site'''
     local = self.DOWNLOAD_FOLDER + os.sep + remote_folder
     # go to remote dir
     self.go_down(remote_folder)
     # get a list of subfolders
     folder_list = self.get_folder_index(remote_folder)
     
     # only for cmd output 
     downloaded = actual = total_files = 0
     # init progressbar
     total = float(len(folder_list))
     count = float(downloaded + actual)
     update_progress(count)
 
     # loop over folders in remote_dir
     for item in folder_list:
         # create local folder
         local_folder = local + os.sep + item
         create_folder(local_folder)
         # go down in ftp file structure and get a list of matching files
         self.go_down(item)            
         file_list = self.get_file_index(db_type)
         # update cmd values
         total_files += len(file_list)
         # update progressbar
         count += float(1 / total)
         update_progress(count)
         # loop over files
         for x in file_list:
             # check timestamps
             if not self.is_actual(local_folder, x):
                 self.download_file(local_folder, x)
                 downloaded += 1
             else:
                 actual += 1    
         # go up file structure
         self.go_up()
     # write status information to stdout
     sys.stdout.write("\nDownloaded: %d Actual: %d Total: %d \n\n" % 
                     (downloaded, actual, all))
     # go up in file structure to root dir
     self.go_up()
Example #9
    def assert_to_proto_paths(self):
        self.checkpoints = os.path.join(
            self.args.checkpoints, '_'.join([
                self.nengine_name, self.args.db_regression_name,
                'v' + '.'.join(self.nengine_version), self.args.postfix
            ]))
        if not os.path.exists(self.checkpoints):
            raise Exception(
                'base_runner.py: class Runner: def assert_to_proto_paths(...): error: checkpoints directory {} does not exist.'
                .format(self.checkpoints))

        self.checkpoints_restore = os.path.join(
            self.checkpoints,
            self.args.restore_checkpoint_template.format(
                self.restore_epoch, self.global_step))
        self.checkpoints_config = os.path.join(self.checkpoints, 'config')
        self.checkpoints_to_proto = os.path.join(self.checkpoints, 'pb_model')
        utils.create_folder(self.checkpoints_config,
                            force=False,
                            raise_except_if_exists=False)
        utils.create_folder(self.checkpoints_to_proto, force=False)
Example #10
    def save_feature_importance(self, method, preds_test=[]):
        logger.info(method)
        path = os.path.join('reports/figures/local/',
                            (self.model.get_name() + '_' + self.feature_set))
        create_folder(path)
        filename = self._get_figure_file_name(method, path)
        explainer = shap.TreeExplainer(
            self.model.get_model(reinitialize=False))
        shap_values = explainer.shap_values(self.X_train)

        if method == "shap":
            # shap_values = shap_values[:, :-1]
            shap.summary_plot(shap_values, self.X_train, show=False)
            plt.savefig(self._get_figure_file_name('shap', path),
                        bbox_inches='tight')

        elif method == 'summary':
            shap.summary_plot(shap_values,
                              self.X_train,
                              plot_type='bar',
                              show=False)
            plt.savefig(self._get_figure_file_name('summary', path),
                        bbox_inches='tight')

        elif method == 'feature_importance':
            feature_score = pd.DataFrame(list(
                zip(self.X_test.dtypes.index, shap_values.mean(axis=0))),
                                         columns=['Feature', 'Score'])
            feature_score = feature_score.sort_values(by='Score',
                                                      ascending=False,
                                                      inplace=False,
                                                      kind='quicksort',
                                                      na_position='last')
            plt.rcParams["figure.figsize"] = (12, 7)
            ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
            ax.set_title("Feature Importance using {}".format(method),
                         fontsize=14)
            ax.set_xlabel("features")
            plt.savefig(self._get_figure_file_name(method, path),
                        bbox_inches='tight')
Example #11
    def assert_train_paths(self):
        self.checkpoints = os.path.join(
            self.args.checkpoints, '_'.join([
                self.nengine_name, self.args.db_regression_name,
                'v' + '.'.join(self.nengine_version), self.args.postfix
            ]))
        self.checkpoints_config = os.path.join(self.checkpoints, 'config')
        self.checkpoints_logs = os.path.join(self.checkpoints, 'logs')
        self.checkpoints_logs_train = os.path.join(self.checkpoints, 'logs',
                                                   'train')
        if self.val_list != '':
            self.checkpoints_logs_val = os.path.join(self.checkpoints, 'logs',
                                                     'val')

        # [optional] Training restoration
        if (self.args.restore_epoch is not None) and (self.args.restore_step
                                                      is not None):
            self.checkpoints_restore = os.path.join(
                self.checkpoints,
                self.args.restore_checkpoint_template.format(
                    self.args.restore_epoch, self.args.restore_step))
            create_folder_kwargs = dict(force=False,
                                        raise_except_if_exists=False)
            if self.args.train_force:
                print(
                    '[WARNING] restore_epoch is not None and restore_step is not None, but train_force is True! Training directories will not be cleaned, training will be restored. If you want to train from scratch, do not set restore_epoch and restore_step arguments'
                )
        else:
            self.checkpoints_restore = None
            create_folder_kwargs = dict(force=self.args.train_force)

        utils.create_folder(self.checkpoints, **create_folder_kwargs)
        utils.create_folder(self.checkpoints_config, **create_folder_kwargs)
        utils.create_folder(self.checkpoints_logs, **create_folder_kwargs)
        utils.create_folder(self.checkpoints_logs_train,
                            **create_folder_kwargs)
        if self.val_list != '':
            utils.create_folder(self.checkpoints_logs_val,
                                **create_folder_kwargs)
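
The utils.create_folder helper used by these runner methods is not shown on this page; the following is one plausible sketch matching the calls in Examples #3, #9 and #11, under the assumption that force means "remove and recreate an existing directory" and raise_except_if_exists controls whether an already existing directory is treated as an error:

import os
import shutil

def create_folder(path, force=False, raise_except_if_exists=True):
    # hypothetical helper matching the keyword arguments used above
    if os.path.exists(path):
        if force:
            shutil.rmtree(path)           # wipe and recreate the directory
        elif raise_except_if_exists:
            raise FileExistsError(path)   # existing directory is an error
        else:
            return                        # keep the existing directory as-is
    os.makedirs(path)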
Example #12
 def create_temporal_folders(self):
     utils.create_folder(lambda_instance.temporal_folder)
     utils.create_folder(lambda_instance.input_folder)
     utils.create_folder(lambda_instance.output_folder)
Example #13
 def setUp(self):
     self.year = 1913
     create_folder('./data/{}/posters'.format(self.year))
     create_folder('./data/{}/thumbnails'.format(self.year))
     self.dict_imgs_1913 = get_yearly_url_imgs(1913)
Example #14
 def test_create_folder(self):
     folder_tmp = './dummy-folder'
     create_folder(folder_tmp)
     self.assertTrue(os.path.exists(folder_tmp))
     os.rmdir(folder_tmp)
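
The plain create_folder helper exercised by this test (unlike the keyword-argument variant sketched after Example #11) is likewise not shown; a minimal sketch that would satisfy the assertion, assuming it simply wraps os.makedirs:

import os

def create_folder(path):
    # create the directory and any missing parents; do nothing if it exists
    os.makedirs(path, exist_ok=True)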
Example #15
 def _save(self, data, path, file_name):
     create_folder(path)
     print('saving interim dataset')
     data.to_csv(os.path.join(path, file_name), index=False)