# 示例#1 (Example #1)
    def representation_creation_dir(
            inp_dir,
            out_dir,
            dataset_name,
            num_p,
            filetype='fastq',
            sampling_dict=None):
        '''
        Create k-mer representations for every sample file in a directory.
        :param inp_dir: directory containing the input sample files
        :param out_dir: prefix/directory used when saving the generated files
        :param dataset_name: dataset label embedded in the output file names
        :param num_p: number of parallel processes
        :param filetype: input file extension ('fastq' or 'fasta')
        :param sampling_dict: mapping of k-mer size -> list of sampling sizes
                              (-1 presumably means "use all sequences" -- TODO confirm)
        :return: None; output files are written under out_dir
        '''
        # Built here rather than in the signature: a mutable default
        # argument would be shared across all calls.
        if sampling_dict is None:
            sampling_dict = {
                3: [20],
                4: [100],
                5: [500],
                6: [100, 1000, 2000, 5000, 10000, -1],
                7: [5000],
                8: [8000]
            }

        fasta_files, mapping = FileUtility.read_fasta_directory(
            inp_dir, filetype)

        for k, sizes in sampling_dict.items():
            for N in sizes:
                print(k, '-mers with sampling size ', N)
                RS = Metagenomic16SRepresentation(fasta_files, mapping, N,
                                                  num_p)
                # path to save the generated files
                RS.generate_kmers_all(
                    k,
                    save=out_dir + '_'.join(
                        [dataset_name, str(k) + '-mers',
                         str(N)]))
# 示例#2 (Example #2)
    def bootstrapping(inp_dir,
                      out_dir,
                      dataset_name,
                      filetype='fastq',
                      k_values=None,
                      sampling_sizes=None):
        '''
        Run bootstrapping over all samples in a directory for several k-mer sizes.
        :param inp_dir: directory containing the input sample files
        :param out_dir: directory where bootstrapping results are written
        :param dataset_name: dataset label used in the result/plot file names
        :param filetype: input file extension ('fastq' or 'fasta')
        :param k_values: k-mer sizes to bootstrap (defaults to 3..8)
        :param sampling_sizes: resampling sizes handed to BootStrapping
        :return: None; results are plotted/saved by BootStrapping
        '''
        # Defaults built here to avoid shared mutable default arguments.
        if k_values is None:
            k_values = [3, 4, 5, 6, 7, 8]
        if sampling_sizes is None:
            sampling_sizes = [
                10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000
            ]

        fasta_files, mapping = FileUtility.read_fasta_directory(
            inp_dir, filetype)
        BS = BootStrapping(fasta_files,
                           out_dir,
                           seqtype=filetype,
                           sampling_sizes=sampling_sizes,
                           n_resamples=10,
                           M=10)

        for k in k_values:
            print(k, '-mer bootstrapping started')
            BS.add_kmer_sampling(k)
            print(k, '-mer bootstrapping completed')

        BS.plotting('results_bootstrapping' + '_' + dataset_name, dataset_name)
 def __init__(self, file_directory, file_extenstion, onlyfiles=None):
     '''
     Index the sample files to be used for segmentation training.
     :param file_directory: the samples directory
     :param file_extenstion: the file extension fastq or fasta
     :param onlyfiles: optional list restricting which files are read
     '''
     # Avoid a mutable default argument shared across instances.
     if onlyfiles is None:
         onlyfiles = []
     print('Segmentation training')
     self.file_directory = file_directory
     self.file_extenstion = file_extenstion
     self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
         self.file_directory, self.file_extenstion, only_files=onlyfiles)
     print(str(len(self.fasta_files)), 'fasta files found in',
           self.file_directory)
# 示例#4 (Example #4)
    def __init__(self,
                 file_directory,
                 file_extenstion,
                 npe_file,
                 onlyfiles=None,
                 sampling_number=3000,
                 num_p=20,
                 vocab_size=-1):
        '''
        Load the sample files and the NPE/BPE vocabulary, and build a
        TF vectorizer over that vocabulary.
        :param file_directory: directory containing the sample files
        :param file_extenstion: the file extension fastq or fasta
        :param npe_file: segmentation model ('.model' -> 'seqpiece' path with a
                         sibling '.vocab' file) or a plain BPE merge list
        :param onlyfiles: optional list restricting which files are read
        :param sampling_number: number of samples drawn per file -- TODO confirm
        :param num_p: number of parallel processes
        :param vocab_size: vocabulary size bookkeeping (-1 presumably means
                           "full vocabulary" -- TODO confirm)
        '''
        # Avoid a mutable default argument shared across instances.
        if onlyfiles is None:
            onlyfiles = []
        self.file_directory = file_directory
        self.file_extenstion = file_extenstion
        self.fasta_files, self.indexing = FileUtility.read_fasta_directory(
            self.file_directory, self.file_extenstion, only_files=onlyfiles)
        print(str(len(self.fasta_files)), 'fasta files found in',
              self.file_directory)

        self.num_p = num_p
        self.sampling_number = sampling_number
        self.npe_file = npe_file

        if '.model' in npe_file:
            # sentencepiece-style model: vocabulary is the first column of
            # the companion '.vocab' file
            self.model_type = 'seqpiece'
            self.npe_vocab = [
                x.split()[0] for x in FileUtility.load_list(
                    npe_file.replace('.model', '.vocab'))
            ]
        else:
            # plain BPE merge list: join merge pairs, strip '</w>' markers,
            # and deduplicate (first line is skipped as a header)
            self.model_type = 'normal_bpe'
            self.npe_vocab = [
                ''.join(x.split()).replace('</w>', '').lower()
                for x in FileUtility.load_list(npe_file)[1::]
            ]
            self.npe_vocab = list(set(self.npe_vocab))
        self.vocab_size = vocab_size
        self.npe_vocab.sort()
        # Raw term counts (use_idf=False, norm=None) over the fixed vocabulary.
        self.npe_vectorizer = TfidfVectorizer(use_idf=False,
                                              vocabulary=self.npe_vocab,
                                              analyzer='word',
                                              norm=None,
                                              stop_words=[],
                                              lowercase=True,
                                              binary=False,
                                              tokenizer=str.split)
# 示例#5 (Example #5)
    def __init__(self,
                 file_directory,
                 file_extenstion,
                 output_directory,
                 dbname,
                 vocab_size,
                 seg_train_depth,
                 rep_sampling_depth,
                 blastn_path,
                 num_p=1,
                 onlyfiles=[],
                 override=1):
        '''
        Set up a DiTaxa workflow run: index the samples, prepare the output
        directories, and load any existing log file.
        :param file_directory: the samples directory
        :param file_extenstion: the file extension fastq or fasta
        :param output_directory: root directory for results; intermediate
               files go under '<output_directory>/intermediate_files/'
        :param dbname: database/run name -- stored for later steps
        :param vocab_size: segmentation vocabulary size -- stored for later steps
        :param seg_train_depth: sampling depth for segmentation training
        :param rep_sampling_depth: sampling depth for representation creation
        :param blastn_path: path to blastn (presumably the executable -- TODO confirm)
        :param num_p: number of parallel processes
        :param onlyfiles: optional list restricting which files are read
        :param override: override flag (semantics used elsewhere -- TODO confirm)
        '''
        # NOTE(review): mutable default onlyfiles=[] is kept only because it is
        # never mutated here; passing a fresh list per call is safer.
        self.override = override
        self.file_directory = file_directory
        self.file_extenstion = file_extenstion
        self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
            self.file_directory, self.file_extenstion, only_files=onlyfiles)
        print(str(len(self.fasta_files)), ' fasta files found in',
              self.file_directory)

        self.dbname = dbname
        self.vocab_size = vocab_size
        self.seg_train_depth = seg_train_depth
        self.rep_sampling_depth = rep_sampling_depth
        self.num_p = num_p
        self.output_directory = output_directory
        # Normalize an optional trailing slash before appending the
        # intermediate-files subdirectory.
        self.output_directory_inter = (
            output_directory[0:-1] if output_directory[-1] == '/' else
            output_directory) + '/intermediate_files/'
        self.blastn_path = blastn_path

        DiTaxaWorkflow.ensure_dir(self.output_directory)
        # Resume from an existing log file if one is present.
        if not os.path.exists(self.output_directory + 'logfile.txt'):
            self.log_file = []
        else:
            self.log_file = FileUtility.load_list(self.output_directory +
                                                  'logfile.txt')
        print('\t✔ DiTaxa workflow is getting started')
# 示例#6 (Example #6)
    def __init__(self,
                 file_directory,
                 file_extenstion,
                 output_directory,
                 dbname,
                 vocab_size,
                 seg_train_depth,
                 rep_sampling_depth,
                 num_p=1,
                 onlyfiles=None):
        '''
        Set up the pipeline: index the samples, prepare the output directory,
        and load any existing log file.
        :param file_directory: the samples directory
        :param file_extenstion: the file extension fastq or fasta
        :param output_directory: root directory for results
        :param dbname: database/run name -- stored for later steps
        :param vocab_size: segmentation vocabulary size -- stored for later steps
        :param seg_train_depth: sampling depth for segmentation training
        :param rep_sampling_depth: sampling depth for representation creation
        :param num_p: number of parallel processes
        :param onlyfiles: optional list restricting which files are read
        '''
        # Avoid a mutable default argument shared across instances.
        if onlyfiles is None:
            onlyfiles = []
        print('Segmentation training')
        self.file_directory = file_directory
        self.file_extenstion = file_extenstion
        self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
            self.file_directory, self.file_extenstion, only_files=onlyfiles)
        print(str(len(self.fasta_files)), 'fasta files found in',
              self.file_directory)

        self.dbname = dbname
        self.vocab_size = vocab_size
        self.seg_train_depth = seg_train_depth
        self.rep_sampling_depth = rep_sampling_depth
        self.num_p = num_p
        self.output_directory = output_directory

        DiTaxaWorkflow.ensure_dir(self.output_directory)
        # Resume from an existing log file if one is present.
        if not os.path.exists(self.output_directory + 'logfile.txt'):
            self.log_file = []
        else:
            self.log_file = FileUtility.load_list(self.output_directory +
                                                  'logfile.txt')
        print('pipeline started')
# 示例#7 (Example #7)
    @staticmethod
    def load_precalculated(file_path):
        '''
        Restore previously saved (precalculated) results from disk.
        :param file_path: path of the stored object
        :return: the loaded object
        '''
        precalculated = FileUtility.load_obj(file_path)
        return precalculated


if __name__ == '__main__':
    # Test case: run k-mer bootstrapping over the HMP body-site samples.
    config_files = FileUtility.recursive_glob(
        '/mounts/data/proj/asgari/github_repos/microbiomephenotype/data_config/bodysites/',
        '*.txt')
    # Collect every sample name listed in the config files and append the
    # fasta extension.
    selected_files = [
        name + '.fsa'
        for cfg in config_files
        for name in FileUtility.load_list(cfg)
    ]
    fasta_files, mapping = FileUtility.read_fasta_directory(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/hmb_data/',
        'fsa',
        only_files=selected_files)
    bootstrapper = BootStrapping(fasta_files, 'body', seqtype='fsa', M=10)
    for kmer_size in [3, 4, 5, 6, 7, 8]:
        print(kmer_size)
        bootstrapper.add_kmer_sampling(kmer_size)