Example #1
    def __init__(self, pos_fasta, neg_fasta, output_path, segmentation_schemes=10, topN=100):
        '''
        Load the positive and negative sequences (each given as a list, a .txt
        file, or a .fasta file), build the 1/0 labels, prepare the segmentation
        schemes, and extract the top-N motifs into output_path.
        '''
        if not isinstance(pos_fasta, str):
            self.pos=pos_fasta
        elif pos_fasta.split('.')[-1]=='txt':
            self.pos=FileUtility.load_list(pos_fasta)
        elif pos_fasta.split('.')[-1]=='fasta':
            self.pos=FileUtility.read_fasta_sequences(pos_fasta)
        if not isinstance(neg_fasta, str):
            self.neg=neg_fasta
        elif neg_fasta.split('.')[-1]=='txt':
            self.neg=FileUtility.load_list(neg_fasta)
        elif neg_fasta.split('.')[-1]=='fasta':
            self.neg=FileUtility.read_fasta_sequences(neg_fasta)
        self.seqs=[seq.lower() for seq in self.pos+self.neg]
        self.labels=[1]*len(self.pos)+[0]*len(self.neg)
        self.segmentation_schemes=segmentation_schemes
        self.load_alpha_distribution()
        self.prepare_segmentations()
        print (output_path)
        FileUtility.ensure_dir(output_path)
        self.output_path=output_path
        self.motif_extraction(topN)
    def generate_LR_important_features(self,
                                       clf_LR,
                                       feature_names,
                                       results_file,
                                       N=1000):
        '''
        Write the top-N logistic regression coefficients (ranked by absolute
        value) to <results_file>_LR as a tab-separated feature/score table.
        :param clf_LR: fitted sklearn logistic regression classifier
        :param feature_names: feature names aligned with the coefficient vector
        :param results_file: base path of the output file
        :param N: number of top features to keep
        :return:
        '''

        results_file = results_file.replace(
            '/classifications/', '/feature_selection/classifications/')
        FileUtility.ensure_dir(results_file)
        file_name = results_file + '_LR'

        # rank features by absolute coefficient magnitude, descending
        idxs = np.argsort(np.abs(clf_LR.coef_[0]))[::-1][0:N]

        f = codecs.open(file_name, 'w')
        f.write('\t'.join(['feature', 'score']) + '\n')
        for idx in idxs:
            f.write('\t'.join(
                [feature_names[idx],
                 str(clf_LR.coef_.tolist()[0][idx])]) + '\n')
        f.close()
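
Every example on this page calls FileUtility.ensure_dir before writing its outputs. The helper itself is not shown; a minimal sketch of what such a utility typically looks like, assuming it simply wraps os.makedirs (names and behavior here are an assumption, not the project's actual code):

import os

class FileUtility:
    @staticmethod
    def ensure_dir(path):
        # Assumed behavior: create the directory for `path` if it is missing.
        # If `path` looks like a file path, create its parent directory instead.
        directory = path if path.endswith('/') else os.path.dirname(path)
        if directory and not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)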
Example #3
 def __init__(self, output_path):
     '''
         Constructor
     '''
     # set the parameters
     self.output_path = output_path
     FileUtility.ensure_dir(self.output_path + '/biblecom_intermediate/')
     FileUtility.ensure_dir(self.output_path + '/reports/')
Example #4
 def download_zipfile(self, url_outpath_rec):
     try:
         url, outpath, iso, code, langname = url_outpath_rec
         FileUtility.ensure_dir(outpath)
         r = requests.get(url)
         z = zipfile.ZipFile(io.BytesIO(r.content))
         z.extractall(outpath)
         temp = PNGScriptRetrieve(
             (url, outpath,
              '../../' + iso + '_' + code.replace('_', '-') + '.png.txt'),
             crawl=False,
             parse=True)
         return url, [iso, code.replace('_', '-'), langname]
     except Exception:
         # download or extraction failed for this record
         return url, False
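
Example #4 uses the common download-and-extract-in-memory idiom; stripped of the project specifics, the core of it is just the following (URL and output directory are hypothetical placeholders):

import io
import zipfile
import requests

# download a zip archive and extract it without writing the archive itself to disk
r = requests.get('https://example.org/archive.zip')
zipfile.ZipFile(io.BytesIO(r.content)).extractall('output_dir/')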
Example #5
    def __init__(self, output_path):
        '''
            Constructor
        '''
        # set the parameters
        self.output_path = output_path
        FileUtility.ensure_dir(self.output_path +
                               '/pngscripture_intermediate/')
        FileUtility.ensure_dir(self.output_path + '/reports/')

        # silence all library warnings by replacing warnings.warn with a no-op
        def warn(*args, **kwargs):
            pass

        import warnings
        warnings.warn = warn
Example #6
 def __init__(self, key, output_path):
     '''
         Constructor
     '''
     # set the parameters
     self.key = key
     self.output_path = output_path
     FileUtility.ensure_dir(self.output_path + '/api_intermediate/')
     FileUtility.ensure_dir(self.output_path + '/reports/')
     self.to_double_check = list()
     # check the API connection
     response = requests.get('https://dbt.io/api/apiversion?key=' +
                             self.key + '&v=2')
     if response.status_code != 200:
         print('Enter a correct API key')
         # __init__ cannot return a value; just stop here on a bad key
         return
     else:
         response = json.loads(response.content)
         print('Connected successfully to the bible digital platform v ' +
               response['Version'])
         self.load_book_map()
    def generate_RF_important_features(self,
                                       clf_random_forest,
                                       feature_names,
                                       results_file,
                                       N=1000):
        '''
        Fit the random forest on (self.X, self.Y) and write the top-N features,
        ranked by impurity-based importance, to <results_file>_RF as a
        tab-separated feature/score table.
        :param clf_random_forest: sklearn RandomForestClassifier instance
        :param feature_names: feature names aligned with the columns of self.X
        :param results_file: base path of the output file
        :param N: number of top features to keep
        :return:
        '''

        results_file = results_file.replace(
            '/classifications/', '/feature_selection/classifications/')
        FileUtility.ensure_dir(results_file)
        file_name = results_file + '_RF'
        clf_random_forest.fit(self.X, self.Y)
        # per-feature std of importances across the individual trees
        std = np.std(
            [tree.feature_importances_ for tree in clf_random_forest.estimators_],
            axis=0)

        scores = {
            feature_names[i]: (s, std[i])
            for i, s in enumerate(list(clf_random_forest.feature_importances_))
            if not math.isnan(s)
        }
        # sort by the importance score (first element of the (score, std) tuple)
        scores = sorted(scores.items(),
                        key=lambda item: item[1][0],
                        reverse=True)[0:N]
        f = codecs.open(file_name, 'w')
        f.write('\t'.join(['feature', 'score']) + '\n')
        for w, score in scores:
            #feature_array = self.X[:, feature_names.index(w)]
            #pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
            #neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
            f.write('\t'.join([str(w), str(score[0])]) + '\n')
        f.close()
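
The two export methods above follow the same pattern: rank features by a fitted model's scores and dump the top N to a tab-separated file. A minimal, self-contained sketch of that pattern (scikit-learn assumed; the data and output file name are hypothetical):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# toy data standing in for self.X / self.Y
X, y = make_classification(n_samples=200, n_features=50, random_state=0)
feature_names = ['f%d' % i for i in range(X.shape[1])]

clf = LogisticRegression(max_iter=1000).fit(X, y)

# rank by absolute coefficient, as generate_LR_important_features does
N = 10
idxs = np.argsort(np.abs(clf.coef_[0]))[::-1][:N]
with open('top_features_LR.txt', 'w') as f:
    f.write('feature\tscore\n')
    for idx in idxs:
        f.write('%s\t%f\n' % (feature_names[idx], clf.coef_[0][idx]))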
Example #8
    def generate_tree(self, path, name):

        path_g = path + '/graphlan_files/'
        FileUtility.ensure_dir(path_g)
        font_map = {1: 15, 2: 14, 3: 13, 4: 12, 5: 8, 6: 7, 7: 4}
        taxonomy = self.get_pandas_df()['taxonomy'].tolist()
        direction = self.get_pandas_df()['direction'].tolist()
        taxlev = self.get_pandas_df()['taxonomylevel'].tolist()

        logpval = [
            round(-np.log(x)) for x in self.get_pandas_df()['pvalue'].tolist()
        ]

        taxonomy = [
            '.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy
        ]
        tax_freq = dict(FreqDist(taxonomy).most_common())
        logpval_frq = [tax_freq[x] for x in taxonomy]

        #taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]

        dict_color = dict()
        for idx, x in enumerate(direction):
            if len(taxonomy[idx].split('.')) > 5:
                coloring = ('r' if x == '+' else ('b' if x == '-' else 'g'))
                if taxonomy[idx].split('.')[-1] in dict_color:
                    dict_color[taxonomy[idx].split('.')[-1]].append(coloring)
                else:
                    dict_color[taxonomy[idx].split('.')[-1]] = [coloring]

        new_dict_color = dict()
        for tax, colors in dict_color.items():
            freq = FreqDist(colors)
            total = freq['r'] + freq['b']
            # guard against taxa that only received the neutral 'g' color
            if total > 0 and freq['r'] / total > 0.8:
                new_dict_color[tax] = 'r'
            elif total > 0 and freq['b'] / total > 0.8:
                new_dict_color[tax] = 'b'
            else:
                new_dict_color[tax] = 'w'
        dict_color = new_dict_color

        annot = [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                dict_color[taxonomy[idx].split('.')[-1]]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
        ]

        #annot=['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color',('r' if x=='+' else ('b' if x=='-' else 'g'))])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                'w'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) == 5
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation',
                taxonomy[idx].split('.')[-1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]

        #annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color','purple'])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]

        ## OUTER RINGS
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation',
                taxonomy[idx].split('.')[1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_font_size',
                 str(9)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation_background_color',
                '#eedbfc'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]

        ## Clades
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_size',
                str(logpval_frq[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_edge_width',
                str(logpval[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]

        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[-1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_font_size',
                str(font_map[taxlev[idx]])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + ['annotation_background_offset\t0.5']
        annot = annot + ['clade_marker_edge_color\t#4f1a49']
        annot = annot + ['branch_color\t#4f1a49']
        annot = annot + ['annotation_background_separation\t-0.01']
        annot = annot + ['annotation_background_width\t0.2']

        #https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
        #asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan_annotate.py --annot ../annot.txt ../test.txt  ../new.xml
        #asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15 --external_legends
        taxonomy = [
            x for x in taxonomy if len(x.split('.')) > 5
            if not dict_color[x.split('.')[-1]] == 'w'
        ]

        FileUtility.save_list(path_g + name + '_taxonomy.txt', taxonomy)
        FileUtility.save_list(path_g + name + '_annot.txt', annot)

        subprocess.call("python3 graphlan/graphlan_annotate.py --annot " +
                        path_g + name + '_annot.txt' + " " + path_g + name +
                        '_taxonomy.txt' + "  " + path_g + name + '.xml',
                        shell=True)
        subprocess.call("python3 graphlan/graphlan.py " + path_g + name +
                        '.xml' + " " + path + name +
                        '.pdf --dpi 1000 --size 15 --external_legends',
                        shell=True)
        try:
            FileUtility.remove(path + name + '_legend.pdf')
        except Exception:
            # the external legend may not have been produced
            pass
Example #9
    def predict_block(self, ultimate=False):
        '''
        Parse the <predict> blocks of the configuration XML and run
        cross-validated classification for every phenotype, feature
        combination, and classifier they specify.
        :return:
        '''
        import warnings
        from sklearn.exceptions import DataConversionWarning, FitFailedWarning, UndefinedMetricWarning, ConvergenceWarning
        warnings.filterwarnings(action='ignore', category=DataConversionWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        
        predict_blocks = self.xmldoc.getElementsByTagName('predict')
        predict_path=self.output+'/classifications/'

        # iterate over predict block
        for predict in predict_blocks:
            # Sub prediction
            FileUtility.ensure_dir(predict_path)
            setting_name=predict.attributes['name'].value
            subdir=predict_path+setting_name+'/'

            FileUtility.ensure_dir(subdir)
            ## label mapping
            labels=predict.getElementsByTagName('labels')[0].getElementsByTagName('label')
            mapping=dict()
            for label in labels:
                val=label.attributes['value'].value
                phenotype=label.firstChild.nodeValue.strip()
                mapping[phenotype]=int(val)

            ## optimizing for ..
            optimization=predict.getElementsByTagName('optimize')[0].firstChild.nodeValue.strip()
            ## number of folds
            self.cvbasis=predict.getElementsByTagName('eval')[0].firstChild.nodeValue.strip()
            folds=int(predict.getElementsByTagName('eval')[0].attributes['folds'].value)
            test_ratio=float(predict.getElementsByTagName('eval')[0].attributes['test'].value)

            if optimization not in ['accuracy','scores_r_1','scores_f1_1','scores_f1_0','f1_macro','f1_micro']:
                print ('Error in choosing optimization score')

            ## Genotype tables
            GPA=GenotypePhenotypeAccess(self.output)
            ## iterate over phenotypes if there exist more than one
            for phenotype in GPA.phenotypes:
                print ('working on phenotype ',phenotype)
                FileUtility.ensure_dir(subdir+phenotype+'/')
                ## create cross-validation
                FileUtility.ensure_dir(subdir+phenotype+'/cv/')
                cv_file=''
                cv_test_file=''
                if not ultimate:
                    if self.cvbasis=='tree':
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/tree/')
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_treefold(subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt']), self.metadata_path + 'phylogentictree.txt', folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/tree/'+''.join([phenotype,'_',setting_name,'_test.txt'])
                    else:
                        FileUtility.ensure_dir(subdir+phenotype+'/cv/rand/')
                        if self.override or not FileUtility.exists(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])):
                            GPA.create_randfold(subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt']), folds, test_ratio, phenotype, mapping)
                        cv_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_folds.txt'])
                        cv_test_file=subdir+phenotype+'/cv/rand/'+''.join([phenotype,'_',setting_name,'_test.txt'])

                features=[x.split('/')[-1].replace('_feature_vect.npz','') for x in FileUtility.recursive_glob(self.representation_path, '*.npz')]
                feature_combinations=[]
                ## TODO: ask as an input
                max_length_feature_comb = 3  # len(features)

                for x in [[list(x) for x in list(itertools.combinations(features,r))] for r in range(3,max_length_feature_comb+1)]:
                    feature_combinations+=x


                ## iterate over feature sets
                for feature_setting in feature_combinations:
                    classifiers=[]
                    for model in predict.getElementsByTagName('model'):
                        for x in model.childNodes:
                            if not x.nodeName=="#text":
                                classifiers.append(x.nodeName)
                    if not ultimate:
                        X, Y, feature_names, final_strains = GPA.get_xy_prediction_mats(feature_setting, phenotype, mapping)

                        feature_setting =[''.join(feature.split('.')[0:-1]) if len(feature.split('.'))>1 else feature for feature in feature_setting]
                        feature_text='##'.join(feature_setting)

                        ## iterate over classifiers
                        for classifier in tqdm.tqdm(classifiers):
                            basepath_cls=subdir+phenotype+'/'+feature_text+'_CV_'+self.cvbasis
                            if classifier.lower()=='svm' and (not FileUtility.exists(basepath_cls+'_SVM.pickle') or self.override):
                                Model = SVM(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names, params=[{'C': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]}])
                            if classifier.lower()=='rf' and  (not FileUtility.exists(basepath_cls+'_RF.pickle') or self.override):
                                Model = RFClassifier(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            if classifier.lower()=='lr' and (not FileUtility.exists(basepath_cls+'_LR.pickle') or self.override):
                                Model = LogRegression(X, Y)
                                Model.tune_and_eval_predefined(basepath_cls, final_strains, folds_file=cv_file, test_file=cv_test_file,njobs=self.cores, feature_names=feature_names)
                            #if classifier.lower()=='dnn':
                            #    Model = DNN(X, Y)
                            #    Model.tune_and_eval(subdir+phenotype+'/'+'_'.join([feature]),njobs=self.cores, kfold=10)
                        # generate selected features
                        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
                        print ('Select the top markers..')
                        generate_top_features(self.output, [x.upper() for x in classifiers], topk=200)
                FileUtility.ensure_dir(subdir+phenotype+'/'+'final_results/')
                #create_excel_file(subdir+phenotype+'/', subdir+phenotype+'/final_results/classification_res.xlsx')


        FileUtility.ensure_dir(self.output+'/'+'ultimate_outputs/')
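
The shape of a <predict> block that this parser expects can be read off the tag and attribute accesses above; a minimal sketch (element names taken from the code, values are hypothetical):

from xml.dom import minidom

# hypothetical configuration matching the tags/attributes read in predict_block
config = minidom.parseString('''
<predict name="example_setting">
    <labels>
        <label value="1">resistant</label>
        <label value="0">susceptible</label>
    </labels>
    <optimize>f1_macro</optimize>
    <eval folds="10" test="0.2">rand</eval>
    <model><svm/><rf/><lr/></model>
</predict>
''')
predict = config.getElementsByTagName('predict')[0]
print(predict.attributes['name'].value)  # example_setting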
Example #10
def training_loop(**kwargs):
    run_parameters = kwargs['run_parameters']
    model_paramters = kwargs['model_paramters']
    # resolve the model-builder function from its name (expects a callable in scope)
    model = eval(kwargs['deep_learning_model'])

    # which GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = str(run_parameters['gpu'])

    # read files
    train_file = 'datasets/train.txt'
    test_file = 'datasets/test.txt'
    LD = LabelingData(train_file, test_file)
    train_lengths = [int(j) for j in FileUtility.load_list('/'.join(train_file.split('/')[0:-1]) + '/train_length.txt')]
    test_lengths = [int(i) for i in FileUtility.load_list('/'.join(test_file.split('/')[0:-1]) + '/test_length.txt')]

    # train/test batch parameters
    train_batch_size = run_parameters['train_batch_size']
    test_batch_size = run_parameters['test_batch_size']
    patience = run_parameters['patience']
    epochs = run_parameters['epochs']

    # model
    model, params = model(LD.n_classes, **model_paramters)

    # output directory
    FileUtility.ensure_dir('results/')
    FileUtility.ensure_dir('results/' + run_parameters['domain_name'] + '/')
    FileUtility.ensure_dir('results/' + run_parameters['domain_name'] + '/' + run_parameters['setting_name'] + '/')
    FileUtility.ensure_dir(
        'results/' + run_parameters['domain_name'] + '/' + run_parameters['setting_name'] + '/' + params + '/')
    full_path = 'results/' + run_parameters['domain_name'] + '/' + run_parameters['setting_name'] + '/' + params + '/'

    # save model
    with open(full_path + 'config.txt', 'w') as fh:
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

    # check points
    filepath = full_path + "/weights-improvement-{epoch:02d}-{weighted_acc:.3f}-{val_weighted_acc:.3f}.hdf5"

    checkpoint = ModelCheckpoint(filepath, monitor='val_weighted_acc', verbose=1, save_best_only=True, mode='max',
                                 period=1)
    earlystopping = EarlyStopping(monitor='val_weighted_acc', min_delta=0, patience=patience, verbose=0, mode='max',
                                  baseline=None)
    callbacks_list = [checkpoint, earlystopping]

    # calculate the sizes
    # number of batches per epoch/evaluation, rounded up to cover a partial last batch
    steps_per_epoch = len(train_lengths) // train_batch_size + (
        1 if len(train_lengths) % train_batch_size else 0)
    validation_steps = len(test_lengths) // test_batch_size + (
        1 if len(test_lengths) % test_batch_size else 0)

    # feed model
    h = model.fit_generator(train_batch_generator_408(train_batch_size), steps_per_epoch=steps_per_epoch,
                            validation_data=validation_batch_generator_408(test_batch_size),
                            validation_steps=validation_steps,
                            shuffle=False, epochs=epochs, verbose=1, callbacks=callbacks_list)

    # Analysis of the performance
    pred_test = [(model.predict_on_batch(x),y,w) for x,y,w in tqdm.tqdm(validation_batches_fortest_408(1))]

    acc_test, conf_mat, conf_mat_column_mapping, contingency_metric, chi2_res_pval, gtest_res_pval = generate_report(pred_test)

    # save the history
    FileUtility.save_obj(full_path + 'history', h.history)
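
The keyword arguments this loop expects can be read off the keys it accesses; a sketch of the parameter dictionaries (keys taken from training_loop, values are placeholders):

# hypothetical parameter dictionaries for training_loop
run_parameters = {
    'gpu': 0,
    'train_batch_size': 64,
    'test_batch_size': 64,
    'patience': 10,
    'epochs': 100,
    'domain_name': 'secondary_structure',
    'setting_name': 'baseline',
}
model_paramters = {}  # forwarded to the model builder (note the original spelling of the key)

# training_loop(run_parameters=run_parameters, model_paramters=model_paramters,
#               deep_learning_model='model_builder_name')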
Example #11
File: DiTaxa.py Project: seedpcseed/DiTaxa
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             phenoname,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None,
                             excel=0):
        '''
        Run NPE marker detection for the given phenotype, assign taxonomy to
        the significant markers, and generate the tree, heatmap, excel and
        t-SNE outputs.
        :return:
        '''
        print('\t✔ NPE Marker detection is started..')
        start = time.time()
        rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # CHECK EXISTING LABELS
        if callable(labeler):
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler(file) in label_mapper
            ]
        else:
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler[file] in label_mapper
            ]

        if callable(labeler):
            Y = [
                str(label_mapper[labeler(filenames[sample_id])])
                for sample_id in selected_samples
            ]
        else:
            Y = [
                str(label_mapper[labeler[filenames[sample_id]]])
                for sample_id in selected_samples
            ]

        FileUtility.save_list(rep_base_path + '_' + phenoname + '_Y.txt', Y)
        DiTaxaWorkflow.ensure_dir(self.output_directory_inter +
                                  'npe_marker_files/')

        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory_inter + 'npe_marker_files/' +
                '_'.join([phenoname, 'chi2_relative.fasta'])):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                G16s = NPEMarkerDetection(
                    rep_base_path + '.npz',
                    rep_base_path + '_' + phenoname + '_Y.txt',
                    rep_base_path + '_features', self.output_directory_inter +
                    'npe_marker_files/' + phenoname, selected_samples)
                G16s.extract_markers()

            end = time.time()
            spent = end - start
            print('\t✔ biomarker extraction ' + phenoname + '  ' + str(spent) +
                  ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('biomarker extraction ' + phenoname + '  ' +
                                 str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            print(
                '\t✔ Biomarkers were already extracted, so the statistical test was bypassed'
            )
            self.log_file.append(
                'Biomarkers were already extracted, so the statistical test was bypassed'
            )

        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        print('\t✔ Taxonomic assignment of the markers..')

        if callable(labeler):
            phenotypes = [
                labeler(filenames[sample_id]) for sample_id in selected_samples
            ]
        else:
            phenotypes = [
                labeler[filenames[sample_id]] for sample_id in selected_samples
            ]

        fasta_file = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_chi2_relative.fasta'
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'

        if len(FileUtility.read_fasta_sequences(fasta_file)) > 2000:
            remove_redundants = False
        else:
            remove_redundants = True

        FileUtility.ensure_dir(self.output_directory +
                               'final_outputs/save_states/')
        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname + '.pickle'):
            start = time.time()
            Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                          matrix_path,
                                          feature_file_path,
                                          phenotypes,
                                          label_mapper,
                                          selected_samples,
                                          p_value_threshold=p_value_threshold,
                                          remove_redundants=remove_redundants,
                                          num_p=self.num_p,
                                          blastn_path=self.blastn_path)
            end = time.time()
            spent = end - start
            DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
            FileUtility.save_obj(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname, Final_OBJ)
            print('\t✔ Marker analysis and alignment ' + phenoname + '  ' +
                  str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('Marker analysis and alignment ' + phenoname +
                                 '  ' + str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            Final_OBJ = FileUtility.load_obj(self.output_directory +
                                             'final_outputs/save_states/' +
                                             phenoname + '.pickle')
            print('\t✔ The aligned markers already exist and were loaded!')
            self.log_file.append(
                'The aligned markers already exist and were loaded!')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        # generating the tree
        Final_OBJ.generate_tree(self.output_directory + 'final_outputs/',
                                phenoname)

        # these paths are needed both for the excel/t-SNE outputs and for the
        # heatmap branch below, so define them regardless of the excel flag
        X_addr = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '.npz'
        feature_addr = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '_features'
        markers = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_finalmarker_list.txt'
        Y = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '_' + phenoname + "_Y.txt"

        if excel == 1:
            print('\t✔ Creating marker excel file..')
            Final_OBJ.generate_excel(
                self.output_directory + 'final_outputs/' + phenoname + '.xlsx',
                phenoname)
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' +
                                    phenoname + '_tsne.pdf',
                                    X_addr,
                                    feature_addr,
                                    markers,
                                    Y,
                                    labels=['Negative', 'Positive'])

        if pos_label and neg_label:
            print('\t✔ Creating marker heatmap..')
            Final_OBJ.update_matrix_by_markers_N()
            Final_OBJ.generate_heatmap(self.output_directory +
                                       'final_outputs/' + phenoname +
                                       '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
            if not excel == 1:
                print('\t✔ Creating t-sne plot..')
                DiTaxaWorkflow.plot_res(self.output_directory +
                                        'final_outputs/' + phenoname +
                                        '_tsne.pdf',
                                        X_addr,
                                        feature_addr,
                                        markers,
                                        Y,
                                        labels=[neg_label, pos_label])
        DiTaxaWorkflow.temp_cleanup()
        print(
            '\t⬛ Marker detection and analysis completed. You can find the results at '
            + self.output_directory +
            ', in particular in the final_outputs subdirectory.')
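
A hypothetical call, inferred from how labeler, label_mapper and the remaining parameters are used above (the DiTaxa object and the file-naming scheme are placeholders, not the project's documented interface):

# labeler maps a sample file name to its phenotype; label_mapper maps the
# phenotype to the 0/1 class used for the statistical test
def labeler(filename):
    return 'treated' if filename.startswith('T') else 'control'

label_mapper = {'treated': 1, 'control': 0}

# ditaxa = DiTaxaWorkflow(...)   # project-specific setup not shown here
# ditaxa.biomarker_extraction(labeler, label_mapper, 'treatment',
#                             p_value_threshold=0.05,
#                             pos_label='treated', neg_label='control',
#                             excel=1)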