Example #1
File: DiTaxa.py Project: seedpcseed/DiTaxa
def RA_healthy():
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/RA/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/RAoutput/',
        'RA',
        50000,
        5000,
        -1,
        num_p=20)
    #Pipeline.train_npe()
    #Pipeline.representation_npe()
    labels = FileUtility.load_list(
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/labels.txt'
    )
    labels = {
        x.split('/')[-1]: labels[idx]
        for idx, x in enumerate(
            FileUtility.load_list(
                '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/ra/rep/ra_selfposnpe_10000_npe_5000_meta'
            ))
    }
    Pipeline.biomarker_extraction(labels, {
        'untreated_RA': 1,
        'treated_RA': 0
    }, 'untreated_vs_treated')
Example #2
    def __init__(self, pos_fasta, neg_fasta, output_path, segmentation_schemes=10, topN=100):
        '''
        :param pos_fasta: positive sequences (a Python list, a .txt list file, or a .fasta file)
        :param neg_fasta: negative sequences (a Python list, a .txt list file, or a .fasta file)
        :param output_path: directory where the extracted motifs are written
        :param segmentation_schemes: number of segmentation schemes to sample
        :param topN: number of top motifs to extract
        '''
        if not isinstance(pos_fasta, str):
            self.pos=pos_fasta
        elif pos_fasta.split('.')[-1]=='txt':
            self.pos=FileUtility.load_list(pos_fasta)
        elif pos_fasta.split('.')[-1]=='fasta':
            self.pos=FileUtility.read_fasta_sequences(pos_fasta)
        if not isinstance(neg_fasta, str):
            self.neg=neg_fasta
        elif neg_fasta.split('.')[-1]=='txt':
            self.neg=FileUtility.load_list(neg_fasta)
        elif neg_fasta.split('.')[-1]=='fasta':
            self.neg=FileUtility.read_fasta_sequences(neg_fasta)
        self.seqs=[seq.lower() for seq in self.pos+self.neg]
        self.labels=[1]*len(self.pos)+[0]*len(self.neg)
        self.segmentation_schemes=segmentation_schemes
        self.load_alpha_distribution()
        self.prepare_segmentations()
        print(output_path)
        FileUtility.ensure_dir(output_path)
        self.output_path=output_path
        self.motif_extraction(topN)
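A minimal usage sketch of this constructor; the class name (MotifWorkflow) and the file paths below are hypothetical placeholders, only the argument pattern follows the code above:

# hypothetical usage; class name and paths are placeholders
workflow = MotifWorkflow(
    pos_fasta='data/positives.fasta',    # alternatively a .txt list file or a Python list of sequences
    neg_fasta='data/negatives.fasta',
    output_path='results/motifs/',
    segmentation_schemes=10,
    topN=100)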
Example #3
 def __init__(self,
              fasta_file,
              matrix_path,
              feature_file_path,
              phenotypes,
              phenotype_mapping,
              selected_samples,
              p_value_threshold=0.01,
              remove_redundants=False,
              num_p=4):
     self.num_p = num_p
     self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
     self.remove_redundants = remove_redundants
     self.ez_taxa_dict = {
         x.split()[0]: x.split()[1].split(';')
         for x in FileUtility.load_list('../db/ez_idx_taxonomy.txt')
     }
     self.mat = FileUtility.load_sparse_csr(matrix_path)
     self.mat = self.mat.toarray()
     self.mat = self.mat[selected_samples, :]
     self.mat = csr_matrix(self.mat)
     self.features = FileUtility.load_list(feature_file_path)
     self.align_markers_parallel(p_value_threshold)
     self.redundant_columns_indentification()
     self.phenotype_mapping = phenotype_mapping
     self.phenotypes = phenotypes
Example #4
 def __init__(self, X_file, Y_file, features_file, path, selected_samples):
     '''
     :param X_file: path to the sparse feature matrix (.npz)
     :param Y_file: path to the label list file
     :param features_file: path to the feature-name list file
     :param path: output path
     :param selected_samples: row indices of the samples to keep
     '''
     self.X = FileUtility.load_sparse_csr(X_file)
     self.X = self.X.toarray()
     self.X = self.X[selected_samples, :]
     self.X = csr_matrix(self.X)
     self.Y = [int(x) for x in FileUtility.load_list(Y_file)]
     self.features = FileUtility.load_list(features_file)
     self.path = path
Example #5
    def __init__(self, X, Y, isolate_list, fold_file, test_file):
        '''
        :param X: feature matrix
        :param Y: labels, aligned with the rows of X
        :param isolate_list: isolate names, aligned with the rows of X
        :param fold_file: file listing the isolates of each cross-validation fold
        :param test_file: file listing the held-out test isolates
        '''
        CrossValidator.__init__(self, X, Y)

        map_to_idx = {isolate: idx for idx, isolate in enumerate(isolate_list)}

        test_idx = [
            map_to_idx[test]
            for test in FileUtility.load_list(test_file)[0].split()
            if test in map_to_idx
        ]

        train_idx = [
            map_to_idx[train] for train in list(
                itertools.chain(
                    *[l.split() for l in FileUtility.load_list(fold_file)]))
        ]

        self.X_test = X[test_idx, :]
        self.Y_test = [Y[idy] for idy in test_idx]

        train_idx = list(set(map_to_idx.values()) - set(test_idx))

        X = X[train_idx, :]
        Y = [Y[idy] for idy in train_idx]
        isolate_list = [isolate_list[idx] for idx in train_idx]

        self.train_isolate_list = isolate_list
        map_to_idx = {isolate: idx for idx, isolate in enumerate(isolate_list)}

        splits = [[
            map_to_idx[item] for item in fold_list.split()
            if item in map_to_idx
        ] for fold_list in FileUtility.load_list(fold_file)]

        new_splits = []
        for i in range(len(splits)):
            train = [idx for fold in splits[:i] + splits[i + 1:] for idx in fold]
            test = splits[i]
            new_splits.append([train, test])

        self.cv = new_splits
        self.X = X
        self.Y = Y
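The self.cv list built above follows scikit-learn's convention of an iterable of (train indices, test indices) pairs, so it can typically be passed straight to the cv argument of scikit-learn utilities. A standalone sketch of that pattern with toy data (not part of the class above):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X_toy = np.random.rand(12, 5)
Y_toy = np.array([0, 1] * 6)
# hand-made folds: each entry is (train index list, test index list)
folds = [([0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11]),
         ([4, 5, 6, 7, 8, 9, 10, 11], [0, 1, 2, 3]),
         ([0, 1, 2, 3, 8, 9, 10, 11], [4, 5, 6, 7])]
print(cross_val_score(LogisticRegression(max_iter=1000), X_toy, Y_toy, cv=folds))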
Example #6
 def load_data(self, dir, prefix_list):
     '''
     Load the saved feature matrices, feature names, and isolate lists.
     :param dir: directory containing the saved representations
     :param prefix_list: file-name prefixes to load
     :return: None; results are stored in self.X, self.feature_names, self.isolates
     '''
     for save_pref in prefix_list:
         print('@@@' + '_'.join([dir + save_pref, 'feature', 'vect.npz']))
         self.X[save_pref] = FileUtility.load_sparse_csr('_'.join(
             [dir + save_pref, 'feature', 'vect.npz']))
         self.feature_names[save_pref] = FileUtility.load_list('_'.join(
             [dir + save_pref, 'feature', 'list.txt']))
         self.isolates[save_pref] = FileUtility.load_list('_'.join(
             [dir + save_pref, 'isolates', 'list.txt']))
Example #7
 def load_data(self, prefix_list=None):
     '''
     Load the saved feature matrices, feature names, and strain lists.
     :param prefix_list: file-name prefixes to load
     :return: None; results are stored in self.X, self.feature_names, self.strains
     '''
     for save_pref in prefix_list:
         print('@@@' + '_'.join([self.representation_path + save_pref, 'feature', 'vect.npz']))
         self.X[save_pref] = FileUtility.load_sparse_csr(
             '_'.join([self.representation_path + save_pref, 'feature', 'vect.npz']))
         self.feature_names[save_pref] = FileUtility.load_list(
             '_'.join([self.representation_path + save_pref, 'feature', 'list.txt']))
         self.strains[save_pref] = FileUtility.load_list(
             '_'.join([self.representation_path + save_pref, 'strains', 'list.txt']))
Example #8
    def __init__(self,
                 file_directory,
                 file_extenstion,
                 npe_file,
                 onlyfiles=[],
                 sampling_number=3000,
                 num_p=20,
                 vocab_size=-1):
        '''
        :param file_directory: directory containing the fasta/fastq samples
        :param file_extenstion: file extension to look for (fastq or fasta)
        :param npe_file: trained segmentation model (.model) or NPE merge-operation file
        :param onlyfiles: optional list restricting which files are used
        :param sampling_number: number of sequences sampled per file
        :param num_p: number of processes
        :param vocab_size: vocabulary size (-1 for unrestricted)
        '''
        self.file_directory = file_directory
        self.file_extenstion = file_extenstion
        self.fasta_files, self.indexing = FileUtility.read_fasta_directory(
            self.file_directory, self.file_extenstion, only_files=onlyfiles)
        print(str(len(self.fasta_files)), 'fasta files found in',
              self.file_directory)

        self.num_p = num_p
        self.sampling_number = sampling_number
        self.npe_file = npe_file

        if '.model' in npe_file:
            self.model_type = 'seqpiece'
            self.npe_vocab = [
                x.split()[0] for x in FileUtility.load_list(
                    npe_file.replace('.model', '.vocab'))
            ]
        else:
            self.model_type = 'normal_bpe'
            self.npe_vocab = [
                ''.join(x.split()).replace('</w>', '').lower()
                for x in FileUtility.load_list(npe_file)[1::]
            ]
            self.npe_vocab = list(set(self.npe_vocab))
        self.vocab_size = vocab_size
        self.npe_vocab.sort()
        self.npe_vectorizer = TfidfVectorizer(use_idf=False,
                                              vocabulary=self.npe_vocab,
                                              analyzer='word',
                                              norm=None,
                                              stop_words=[],
                                              lowercase=True,
                                              binary=False,
                                              tokenizer=str.split)
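With use_idf=False, norm=None, binary=False, and a fixed vocabulary, the TfidfVectorizer configured above reduces to counting how often each vocabulary unit occurs in each whitespace-tokenized document. A small self-contained illustration with a toy vocabulary and toy documents:

from sklearn.feature_extraction.text import TfidfVectorizer

toy_vocab = sorted(['acgt', 'ccgg', 'ttga'])
vectorizer = TfidfVectorizer(use_idf=False, vocabulary=toy_vocab, analyzer='word',
                             norm=None, stop_words=[], lowercase=True,
                             binary=False, tokenizer=str.split)
docs = ['ACGT TTGA ACGT', 'CCGG ttga']
print(vectorizer.fit_transform(docs).toarray())
# [[2. 0. 1.]
#  [0. 1. 1.]]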
Example #9
    def make_labels(self, mapping=None):
        '''
            Load the mapping from strains to their phenotype labels.
        '''
        label_file_address = self.metadata_path + 'phenotypes.txt'
        rows = FileUtility.load_list(label_file_address)
        self.strain2labelvector = {
            str(entry.split()[0]): [str(x) for idx, x in enumerate(entry.split('\t')[1::])] for entry in rows[1::]}
        self.labeled_strains = list(self.strain2labelvector)
        self.labeled_strains.sort()

        self.phenotypes = [x for x in rows[0].rstrip().split()[1::]]
        # init
        for phenotype in self.phenotypes:
            self.phenotype2labeled_strains_mapping[phenotype] = []

        # only consider non-empty values
        for strain, phenotype_vec in self.strain2labelvector.items():
            for idx, val in enumerate(phenotype_vec):
                if mapping:
                    if val in mapping:
                        self.phenotype2labeled_strains_mapping[self.phenotypes[idx]].append((strain, mapping[val]))
                else:
                    self.phenotype2labeled_strains_mapping[self.phenotypes[idx]].append((strain, val))
        # generate dict of labels for each class
        for phenotype in self.phenotypes:
            self.phenotype2labeled_strains_mapping[phenotype] = dict(self.phenotype2labeled_strains_mapping[phenotype])
Example #10
def org_classification():
    '''
    Train and 10-fold cross-validate a multi-class MLP on the 6-mer organism dataset.
    '''
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/org/K/org_label_restrictedkmer.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[1024,0.2,256,0.1,256,0.1,128,0.1,64])
    DNN.cross_validation('../../datasets/results/org/classifier/nn', gpu_dev='2', n_fold=10, epochs=30, batch_size=100, model_strct='mlp')
Example #11
 def create_report_png(self):
     report = {
         'language_iso': [],
         'trans_ID': [],
         'language_name': [],
         'verses': []
     }
     self.df_png = pd.DataFrame(report)
     png_files = FileUtility.recursive_glob(self.output_path + '/',
                                            '*.png.txt')
     for png_file in png_files:
         iso, code = png_file.split('/')[-1].split(
             '.')[0:-1][0:-1][-1].split('_')
         length = len(FileUtility.load_list(png_file))
         lang_name = self.lang_dict[
             iso] if iso in self.lang_dict else 'ISO: ' + iso
         self.df_png = self.df_png.append(
             {
                 'language_iso': iso,
                 'trans_ID': code,
                 'language_name': lang_name,
                 'verses': length
             },
             ignore_index=True)
     self.df_png.set_index('trans_ID')
     self.df_png.to_csv(
         self.output_path + '/reports/crawl_report_png.tsv',
         sep='\t',
         index=False,
         columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
     self.generate_final_rep()
Example #12
 def create_report_cloud(self):
     report = {
         'language_iso': [],
         'trans_ID': [],
         'language_name': [],
         'Description': [],
         'verses': []
     }
     for trID in self.df_cloud.trans_ID:
         iso = self.id2iso_dict[trID]
         if not FileUtility.exists(self.output_path + '/' + iso + '_' +
                                   trID + '.cloud.txt'):
             length = 0
         else:
             length = len(
                 FileUtility.load_list(self.output_path + '/' + iso + '_' +
                                       trID + '.cloud.txt'))
             report['language_iso'].append(iso)
             report['trans_ID'].append(trID)
             report['language_name'].append(self.id2lang_dict[trID])
             report['Description'].append(self.id2version[trID])
             report['verses'].append(length)
     report = pd.DataFrame(report)
     report.set_index('trans_ID')
     report.to_csv(self.output_path + '/reports/crawl_report_cloud.tsv',
                   sep='\t',
                   index=False,
                   columns=[
                       'language_iso', 'trans_ID', 'language_name',
                       'Description', 'verses'
                   ])
     self.generate_final_rep()
Example #13
def train_batch_generator_408(batch_size=64):
    '''
    :param batch_size: number of sequences per batch
    :return: generator yielding (X, Y, sample_weight) batches over the training set
    '''
    start_idx = 0
    train_lengths = [int(j) for j in FileUtility.load_list(
        'datasets/train_length.txt')]
    X_train = np.load('datasets/X_train_408.npy')
    Y_train = np.array(
        np.load('datasets/train_mat_Y.npy'))
    while True:
        if not start_idx < len(train_lengths):
            start_idx = 0
        X = X_train[start_idx:(min(start_idx + batch_size, len(train_lengths))),
            0:train_lengths[min(start_idx + batch_size, len(train_lengths)) - 1]]
        Y = Y_train[start_idx:(min(start_idx + batch_size, len(train_lengths))),
            0:train_lengths[min(start_idx + batch_size, len(train_lengths)) - 1], :]

        W = []
        for idx in range(start_idx, (min(start_idx + batch_size, len(train_lengths)))):
            W.append([1 if l < train_lengths[idx] else 0 for l in
                      range(0, train_lengths[min(start_idx + batch_size, len(train_lengths)) - 1])])

        start_idx += batch_size

        yield X, Y, np.array(W)
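The generator yields (X, Y, sample_weight) batches indefinitely, so it can be consumed with next() or handed to a Keras-style fit loop. A hedged usage sketch, assuming the datasets/ files referenced above exist:

gen = train_batch_generator_408(batch_size=64)
X_batch, Y_batch, W_batch = next(gen)   # one padded batch plus its length mask
print(X_batch.shape, Y_batch.shape, W_batch.shape)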
Example #14
    def create_report_biblecom(self):
        self.df_biblecom['verses'] = 0

        biblecom_files = FileUtility.recursive_glob(self.output_path + '/',
                                                    '*.biblecom.txt')
        for bib_file in biblecom_files:
            file_parts = bib_file.split('/')[-1].split(
                '.')[0:-1][0:-1][-1].split('_')
            num_file_parts = len(file_parts)
            if num_file_parts == 2:
                iso, code = file_parts
            elif num_file_parts == 3:
                iso = "_".join(file_parts[:2])
                code = file_parts[2]
            else:
                continue
            length = len(FileUtility.load_list(bib_file))
            mask = ((self.df_biblecom['language_iso'] == iso)
                    & (self.df_biblecom['trans_ID'] == int(code)))
            self.df_biblecom.loc[mask, 'verses'] = length
        self.df_biblecom.set_index('trans_ID')
        self.df_biblecom.to_csv(
            self.output_path + '/reports/crawl_report_biblecom.tsv',
            sep='\t',
            index=False,
            columns=['language_iso', 'trans_ID', 'language_name', 'verses'])
        self.generate_final_rep()
Example #15
def validation_batch_generator_408(batch_size=100):
    '''
    :param batch_size: number of sequences per batch
    :return: generator yielding (X, Y, sample_weight) batches over the test set
    '''
    test_lengths = [int(i) for i in FileUtility.load_list(
        'datasets/test_length.txt')]
    X_test = np.load('datasets/X_test_408.npy')
    Y_test = np.array(
        np.load('datasets/test_mat_Y.npy'))
    start_idx = 0
    while True:
        if not start_idx < len(test_lengths):
            start_idx = 0
        X = X_test[start_idx:(min(start_idx + batch_size, len(test_lengths))),
            0:test_lengths[min(start_idx + batch_size, len(test_lengths)) - 1]]
        Y = Y_test[start_idx:(min(start_idx + batch_size, len(test_lengths))),
            0:test_lengths[min(start_idx + batch_size, len(test_lengths)) - 1], :]
        W = []
        for idx in range(start_idx, (min(start_idx + batch_size, len(test_lengths)))):
            W.append([1 if l < test_lengths[idx] else 0 for l in
                      range(0, test_lengths[min(start_idx + batch_size, len(test_lengths)) - 1])])

        start_idx += batch_size
        yield X, Y, np.array(W)
Example #16
    def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):

        ## find a mapping from strains to the phenotypes
        if mapping:
            mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
        else:
            mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]

        # get common strains
        list_of_list_of_strains = list(self.strains.values())
        list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
        final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
        final_strains.sort()

        # prepare test
        Y = [mapping_isolate_label[strain] for strain in final_strains]

        isolate_to_group=dict([tuple(l.split('\t')) for l in FileUtility.load_list(tree_addr.replace(tree_addr.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'))])

        groups=[int(isolate_to_group[iso]) for iso in final_strains]
        group_kfold = GroupKFold(n_splits=round(1/test_ratio))

        train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
        X_test=[final_strains[x] for x in test_index]
        FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
        final_strains = [final_strains[ix] for ix in train_index]
        group_kfold = GroupKFold(n_splits=cv)

        folds=[]
        for _, test_index in group_kfold.split(train_index, [Y[idx] for idx in train_index],  [groups[idx] for idx in train_index]):
            folds.append(test_index)
        folds=['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in  folds]
        FileUtility.save_list(path, folds)
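GroupKFold is what keeps phylogenetically related isolates on the same side of every split: no group is ever divided between training and test indices. A minimal, self-contained sketch of that behavior with toy strains and clusters:

import numpy as np
from sklearn.model_selection import GroupKFold

strains = np.array(['s1', 's2', 's3', 's4', 's5', 's6'])
labels = np.array([1, 0, 1, 0, 1, 0])
groups = np.array([0, 0, 1, 1, 2, 2])   # e.g. phylogenetic clusters

for train_idx, test_idx in GroupKFold(n_splits=3).split(strains, labels, groups):
    # no group appears on both sides of a split
    print(sorted(set(groups[train_idx])), sorted(set(groups[test_idx])))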
Example #17
def eco_all_classification_transfer_learning():
    '''
    Fine-tune a pretrained MLP (frozen layers) on the full eco dataset with 5-fold CV.
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[512,0.1,256, 0.1,128])
    DNN.cross_validation('../../datasets/results/eco_all/nn', gpu_dev='6', pretrained_model=True,trainable=False, n_fold=5, epochs=10, batch_size=10, model_strct='../../datasets/results/eco_10000/classifiers/nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle')
Example #18
def eco_all_classification():
    '''
    Train and 10-fold cross-validate a multi-class MLP on the full eco 6-mer dataset.
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[1024,0.2,512,0.2,512,0.1,256])
    DNN.cross_validation('../../datasets/results/eco_all/nn', gpu_dev='1', n_fold=10, epochs=20, batch_size=10, model_strct='mlp')
Example #19
def crohns_disease():
    '''
    Train and 3-fold cross-validate an MLP on the Crohn's disease 6-mer dataset.
    '''
    #[1024,0.2,256,0.1,256,0.1,128,0.1,64]
    X=FileUtility.load_sparse_csr('../../datasets/processed_data/crohn/sample-size/6-mers_rate_complete1359_seq_5000.npz').toarray()
    Y=FileUtility.load_list('../../datasets/processed_data/crohn/data_config/labels_disease_complete1359.txt')
    DNN=DNNMutliclass16S(X,Y,model_arch=[512,0.2,256,0.2,128,0.1,64,16])
    DNN.cross_validation('../../datasets/results/crohn/classifier/nn', gpu_dev='2', n_fold=3, epochs=25, batch_size=10, model_strct='mlp')
Example #20
 def load_book_map(self):
     '''
         Load the book-number mapping.
     '''
     self.book_map = dict()
     for l in FileUtility.load_list('../meta/books2numbers.txt'):
         for y in l.split('\t')[1].split(','):
             self.book_map[y] = l.split('\t')[0]
Example #21
def generate_top_features(path, classifier_list, topk=200):
    ## TODO: take topk as an input argument

    writer = pd.ExcelWriter(path + '/ultimate_outputs/selected_features.xls',
                            engine='xlsxwriter')

    final_results = dict()
    for classifier in classifier_list:
        feature_files = FileUtility.recursive_glob(
            path + '/feature_selection/', '*_' + classifier)
        res = dict()
        for file in feature_files:
            phenotype = file.split('/')[0:-1][-1]
            if not phenotype in res:
                res[phenotype] = [file]
            else:
                if file.split('/')[-1].count('##') > res[phenotype][0].split(
                        '/')[-1].count('##'):
                    res[phenotype] = [file]
                elif file.split('/')[-1].count(
                        '##') == res[phenotype][0].split('/')[-1].count('##'):
                    res[phenotype].append(file)
        for phenotype in res.keys():
            if phenotype not in final_results:
                final_results[phenotype] = []
            final_results[phenotype] += res[phenotype]
    for phenotype, files in final_results.items():
        selected = [{
            x.split('\t')[0]: 1 / (idx + 1)
            for idx, x in enumerate(FileUtility.load_list(file)[1:topk])
        } for file in files]
        res = set(selected[0])
        for set_select in selected[1::]:
            res = res.intersection(set_select)

        geno_val_res = dict()
        for dict_geno_val in selected:
            for x, val in dict_geno_val.items():
                if x not in geno_val_res:
                    geno_val_res[x] = [val, 1]
                else:
                    geno_val_res[x][0] += val
                    geno_val_res[x][1] += 1

        df_dict = {'feature_name': [], 'mrr': [], 'freq_confirmation': []}
        for name, values in geno_val_res.items():
            rr, nr = values
            df_dict['feature_name'].append(name)
            df_dict['mrr'].append(rr / nr)
            df_dict['freq_confirmation'].append(nr)
        df = pd.DataFrame(df_dict)
        df.sort_values(['freq_confirmation', 'mrr', 'feature_name'],
                       ascending=[False, False, False],
                       inplace=True)
        df = df.copy()
        df.to_excel(writer, sheet_name=phenotype, index=False)
Example #22
def test():
    X = FileUtility.load_sparse_csr(
        '../body-sites/npe_rate_5000.npz').toarray()
    Y = FileUtility.load_list(
        '../body-sites/npe_representations_labels/labels_phen.txt')
    DNN = DNNMutliclass16S(X, Y, model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64])
    DNN.cross_validation('../body-sites/nn',
                         gpu_dev='2',
                         n_fold=3,
                         epochs=300,
                         batch_size=10,
                         model_strct='mlp')
Example #23
 def sequence_lengths(input_file):
     train = FileUtility.load_list(input_file)
     training_data = [line.split() for line in train]
     final_list = list()
     temp = []
     for x in training_data:
         if x == []:
             final_list.append(temp)
             temp = []
         else:
             temp.append(x)
     return [len(prot) for prot in final_list]
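A hedged usage sketch, assuming the method is exposed as a standalone function or static method and that the input file holds one token per line with blank lines separating sequences (the CoNLL-like layout parsed above); the path is a placeholder:

lengths = sequence_lengths('datasets/train_tokens.txt')   # placeholder path
print(len(lengths), max(lengths))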
Example #24
File: DiTaxa.py Project: seedpcseed/DiTaxa
 def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs,
                    batch_size):
     # k-mer data
     X = FileUtility.load_sparse_csr(X_file).toarray()
     # labels
     Y = [int(y) for y in FileUtility.load_list(Y_file)]
     DeepNN = DNN(X, Y, model_arch=arch)
     DeepNN.cross_validation(out_dir,
                             gpu_dev=gpu_id,
                             n_fold=10,
                             epochs=epochs,
                             batch_size=batch_size,
                             model_strct='mlp')
Example #25
 def DNN_classifier(X_file, Y_file, arch, out_dir, dataset_name, gpu_id,
                    epochs, batch_size):
     # k-mer data
     X = FileUtility.load_sparse_csr(X_file).toarray()
     # labels
     Y = FileUtility.load_list(Y_file)
     DNN = DNNMutliclass16S(X, Y, model_arch=arch)
     DNN.cross_validation(out_dir + 'nn_classification_results_' +
                          dataset_name,
                          gpu_dev=gpu_id,
                          n_fold=10,
                          epochs=epochs,
                          batch_size=batch_size,
                          model_strct='mlp')
Example #26
 def jump_url(self):
     '''
     :return: the next unseen, resolvable URL, or None when the list is exhausted
     '''
     while self.counter < 1188:
         self.counter+=1
         url_select='/'.join(self.url.split('/')[0:-1])+'/'+FileUtility.load_list('../meta/pngscript_filenames.txt')[self.counter]
         if url_select not in self.seen and url_select not in self.useless_url:
             if requests.get(url_select).status_code==404:
                 if requests.get('/'.join(self.url.split('/')[0:-1])).status_code==404:
                     self.counter=1189
                     return None
                 self.useless_url.add(url_select)
             else:
                 url=url_select
                 self.useless_url.add(url)
                 return url
     return None
Example #27
 def convert_to_kmer(input_file, out_file, n=3):
     train = FileUtility.load_list(input_file)
     training_data = [line.split() for line in train]
     final_list = list()
     temp = []
     for x in training_data:
         if x == []:
             final_list.append(temp)
             temp = []
         else:
             temp.append(x)
     res = []
     for prot in final_list:
         sentence = ''.join(['$'] + [aa[0] for aa in prot] + ['#'])
         res += [(sentence[i:i + n], prot[i][1])
                 for i in range(len(sentence) - n + 1)]
         res += ['']
     FileUtility.save_list(out_file, [' '.join(list(x)) for x in res])
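A hedged usage sketch, again assuming a standalone function or static method and placeholder paths; the input uses the same token-per-line, blank-line-separated layout as in Example #23 and is rewritten as character n-gram / label pairs:

convert_to_kmer('datasets/train_tokens.txt', 'datasets/train_3mers.txt', n=3)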
Example #28
File: DiTaxa.py Project: seedpcseed/DiTaxa
    def __init__(self,
                 file_directory,
                 file_extenstion,
                 output_directory,
                 dbname,
                 vocab_size,
                 seg_train_depth,
                 rep_sampling_depth,
                 blastn_path,
                 num_p=1,
                 onlyfiles=[],
                 override=1):
        '''
        :param file_directory: the samples directory
        :param file_extenstion: the file extension, fastq or fasta
        :param output_directory: directory for intermediate and final outputs
        :param dbname: name used for the generated outputs/database
        :param vocab_size: segmentation vocabulary size
        :param seg_train_depth: sampling depth for training the segmentation
        :param rep_sampling_depth: sampling depth for building the representations
        :param blastn_path: path to the blastn executable
        :param num_p: number of processes
        :param onlyfiles: filter a list of files
        :param override: whether to overwrite existing intermediate files
        '''
        self.override = override
        self.file_directory = file_directory
        self.file_extenstion = file_extenstion
        self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
            self.file_directory, self.file_extenstion, only_files=onlyfiles)
        print(str(len(self.fasta_files)), 'fasta files found in',
              self.file_directory)

        self.dbname = dbname
        self.vocab_size = vocab_size
        self.seg_train_depth = seg_train_depth
        self.rep_sampling_depth = rep_sampling_depth
        self.num_p = num_p
        self.output_directory = output_directory
        self.output_directory_inter = (
            output_directory[0:-1] if output_directory[-1] == '/' else
            output_directory) + '/intermediate_files/'
        self.blastn_path = blastn_path

        DiTaxaWorkflow.ensure_dir(self.output_directory)
        if not os.path.exists(self.output_directory + 'logfile.txt'):
            self.log_file = []
        else:
            self.log_file = FileUtility.load_list(self.output_directory +
                                                  'logfile.txt')
        print('\t✔ DiTaxa workflow is getting started')
Example #29
File: DiTaxa.py Project: seedpcseed/DiTaxa
def IBD():
    Pipeline = DiTaxaWorkflow(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/crohn/',
        'fastq',
        '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/IBDout/',
        'IBD',
        50000,
        5000,
        -1,
        num_p=20)
    Pipeline.train_npe()
    Pipeline.representation_npe()
    labels = dict([(x.split()[0] + '.fastq', x.split()[1])
                   for x in FileUtility.load_list(
                       '/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/crohns/rep/Crohns_lables.txt')])
    Pipeline.biomarker_extraction(labels, {
        'CD': 1,
        'no': 0,
        'control': 0
    }, 'CD_vs_healthy')
Example #30
File: DiTaxa.py Project: seedpcseed/DiTaxa
    def classical_classifier(out_dir, X_file, Y_file, model, cores):
        # k-mer data
        X = FileUtility.load_sparse_csr(X_file)
        # labels
        Y = [int(y) for y in FileUtility.load_list(Y_file)]

        if model == 'RF':
            #### Random Forest classifier
            MRF = RFClassifier(X, Y)
            # results containing the best parameter, confusion matrix, best estimator, results on fold will be stored in this address
            MRF.tune_and_eval(out_dir, njobs=cores)
        elif model == 'SVM':
            #### Support Vector Machine classifier
            MSVM = SVM(X, Y)
            # results containing the best parameter, confusion matrix, best estimator, results on fold will be stored in this address
            MSVM.tune_and_eval(out_dir, njobs=cores)
        elif model == 'LR':
            #### Logistic regression classifier
            MLR = LogRegression(X, Y)
            # results containing the best parameter, confusion matrix, best estimator, results on fold will be stored in this address
            MLR.tune_and_eval(out_dir, njobs=cores)