def representation_creation_dir(inp_dir,
                                out_dir,
                                dataset_name,
                                num_p,
                                filetype='fastq',
                                sampling_dict={
                                    3: [20],
                                    4: [100],
                                    5: [500],
                                    6: [100, 1000, 2000, 5000, 10000, -1],
                                    7: [5000],
                                    8: [8000]
                                }):
    '''
    Create k-mer representations for all samples in a directory.
    :param inp_dir: the samples directory
    :param out_dir: output path prefix for the generated representation files
    :param dataset_name: dataset name used in the output file names
    :param num_p: number of processes to use
    :param filetype: the file extension, fastq or fasta
    :param sampling_dict: mapping from k-mer size to the list of sampling sizes to generate
    '''
    fasta_files, mapping = FileUtility.read_fasta_directory(inp_dir, filetype)
    for k in sampling_dict.keys():
        for N in sampling_dict[k]:
            print(k, '-mers with sampling size ', N)
            RS = Metagenomic16SRepresentation(fasta_files, mapping, N, num_p)
            # path to save the generated files
            RS.generate_kmers_all(
                k,
                save=out_dir + '_'.join([dataset_name, str(k) + '-mers', str(N)]))
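# A usage sketch, not part of the original module: the paths and dataset name below
# are hypothetical placeholders, and the call assumes FileUtility and
# Metagenomic16SRepresentation are importable in this file.
def _example_representation_creation():
    # generate 6-mer representations at sampling sizes 1000 and -1 (two of the defaults above)
    representation_creation_dir(inp_dir='/path/to/fastq_samples/',
                                out_dir='/path/to/representations/',
                                dataset_name='example_dataset',
                                num_p=4,
                                filetype='fastq',
                                sampling_dict={6: [1000, -1]})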
def bootstrapping(inp_dir,
                  out_dir,
                  dataset_name,
                  filetype='fastq',
                  k_values=[3, 4, 5, 6, 7, 8],
                  sampling_sizes=[
                      10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000
                  ]):
    '''
    Run the bootstrapping analysis over a range of k-mer sizes and sampling sizes.
    :param inp_dir: the samples directory
    :param out_dir: output directory for the bootstrapping results
    :param dataset_name: dataset name used in the output file and plot names
    :param filetype: the file extension, fastq or fasta
    :param k_values: k-mer sizes to evaluate
    :param sampling_sizes: sampling sizes to evaluate
    :return: None
    '''
    fasta_files, mapping = FileUtility.read_fasta_directory(inp_dir, filetype)
    BS = BootStrapping(fasta_files,
                       out_dir,
                       seqtype=filetype,
                       sampling_sizes=sampling_sizes,
                       n_resamples=10,
                       M=10)
    for k in k_values:
        print(k, '-mer bootstrapping started')
        BS.add_kmer_sampling(k)
        print(k, '-mer bootstrapping completed')
    BS.plotting('results_bootstrapping' + '_' + dataset_name, dataset_name)
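# A usage sketch, not part of the original module: the paths and dataset name are
# hypothetical placeholders, and the call assumes FileUtility and BootStrapping are
# importable in this file.
def _example_bootstrapping():
    # restrict the k-mer sizes and sampling depths to keep the example run short
    bootstrapping(inp_dir='/path/to/fastq_samples/',
                  out_dir='/path/to/bootstrap_results/',
                  dataset_name='example_dataset',
                  filetype='fastq',
                  k_values=[3, 4, 5],
                  sampling_sizes=[10, 100, 1000])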
def __init__(self, file_directory, file_extenstion, onlyfiles=[]):
    '''
    :param file_directory: the samples directory
    :param file_extenstion: the file extension, fastq or fasta
    :param onlyfiles: restrict processing to this list of files (empty list means all files)
    '''
    print('Segmentation training')
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), 'fasta files found in',
          self.file_directory)
def __init__(self,
             file_directory,
             file_extenstion,
             npe_file,
             onlyfiles=[],
             sampling_number=3000,
             num_p=20,
             vocab_size=-1):
    '''
    :param file_directory: the samples directory
    :param file_extenstion: the file extension, fastq or fasta
    :param npe_file: path to the trained segmentation model ('.model') or BPE merge file
    :param onlyfiles: restrict processing to this list of files (empty list means all files)
    :param sampling_number: number of sequences sampled per file
    :param num_p: number of processes to use
    :param vocab_size: vocabulary size
    '''
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.indexing = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), 'fasta files found in',
          self.file_directory)
    self.num_p = num_p
    self.sampling_number = sampling_number
    self.npe_file = npe_file
    if '.model' in npe_file:
        # model file with a matching '.vocab' file listing the vocabulary
        self.model_type = 'seqpiece'
        self.npe_vocab = [
            x.split()[0] for x in FileUtility.load_list(
                npe_file.replace('.model', '.vocab'))
        ]
    else:
        # plain BPE merge file: rebuild the vocabulary from the merge operations
        self.model_type = 'normal_bpe'
        self.npe_vocab = [
            ''.join(x.split()).replace('</w>', '').lower()
            for x in FileUtility.load_list(npe_file)[1::]
        ]
    self.npe_vocab = list(set(self.npe_vocab))
    self.vocab_size = vocab_size
    self.npe_vocab.sort()
    # count the NPE segments per sequence (no idf weighting, no normalization)
    self.npe_vectorizer = TfidfVectorizer(use_idf=False,
                                          vocabulary=self.npe_vocab,
                                          analyzer='word',
                                          norm=None,
                                          stop_words=[],
                                          lowercase=True,
                                          binary=False,
                                          tokenizer=str.split)
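# A minimal sketch, not part of the original code, of what the vectorizer configured
# above computes, assuming scikit-learn is installed; the vocabulary and the two
# segmented reads are made-up toy values.
from sklearn.feature_extraction.text import TfidfVectorizer


def _example_npe_counting():
    vocab = ['acgt', 'ggta', 'ttac']
    segmented_reads = ['acgt ggta acgt', 'ttac acgt']
    vectorizer = TfidfVectorizer(use_idf=False,
                                 vocabulary=vocab,
                                 analyzer='word',
                                 norm=None,
                                 stop_words=[],
                                 lowercase=True,
                                 binary=False,
                                 tokenizer=str.split)
    # rows correspond to reads, columns to vocabulary segments, values to raw counts
    counts = vectorizer.fit_transform(segmented_reads)
    print(counts.toarray())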
def __init__(self,
             file_directory,
             file_extenstion,
             output_directory,
             dbname,
             vocab_size,
             seg_train_depth,
             rep_sampling_depth,
             blastn_path,
             num_p=1,
             onlyfiles=[],
             override=1):
    '''
    :param file_directory: the samples directory
    :param file_extenstion: the file extension, fastq or fasta
    :param output_directory: directory for the workflow outputs
    :param dbname: dataset name used to label the output files
    :param vocab_size: segmentation vocabulary size
    :param seg_train_depth: sampling depth used for segmentation training
    :param rep_sampling_depth: sampling depth used for representation creation
    :param blastn_path: path to the blastn executable
    :param num_p: number of processes to use
    :param onlyfiles: restrict processing to this list of files (empty list means all files)
    :param override: set to 1 to regenerate existing intermediate files
    '''
    self.override = override
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), 'fasta files found in',
          self.file_directory)
    self.dbname = dbname
    self.vocab_size = vocab_size
    self.seg_train_depth = seg_train_depth
    self.rep_sampling_depth = rep_sampling_depth
    self.num_p = num_p
    self.output_directory = output_directory
    self.output_directory_inter = (
        output_directory[0:-1]
        if output_directory[-1] == '/' else output_directory
    ) + '/intermediate_files/'
    self.blastn_path = blastn_path
    DiTaxaWorkflow.ensure_dir(self.output_directory)
    if not os.path.exists(self.output_directory + 'logfile.txt'):
        self.log_file = []
    else:
        self.log_file = FileUtility.load_list(self.output_directory +
                                              'logfile.txt')
    print('\t✔ DiTaxa workflow is getting started')
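# A usage sketch, not from the original file: assuming this constructor belongs to
# DiTaxaWorkflow (as the DiTaxaWorkflow.ensure_dir call suggests), the paths,
# dataset name, depths, and vocabulary size below are hypothetical placeholders.
def _example_ditaxa_workflow():
    workflow = DiTaxaWorkflow(file_directory='/path/to/fastq_samples/',
                              file_extenstion='fastq',
                              output_directory='/path/to/ditaxa_output/',
                              dbname='example_dataset',
                              vocab_size=50000,
                              seg_train_depth=10000,
                              rep_sampling_depth=10000,
                              blastn_path='/usr/bin/blastn',
                              num_p=4)
    return workflow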
def __init__(self,
             file_directory,
             file_extenstion,
             output_directory,
             dbname,
             vocab_size,
             seg_train_depth,
             rep_sampling_depth,
             num_p=1,
             onlyfiles=[]):
    '''
    :param file_directory: the samples directory
    :param file_extenstion: the file extension, fastq or fasta
    :param output_directory: directory for the workflow outputs
    :param dbname: dataset name used to label the output files
    :param vocab_size: segmentation vocabulary size
    :param seg_train_depth: sampling depth used for segmentation training
    :param rep_sampling_depth: sampling depth used for representation creation
    :param num_p: number of processes to use
    :param onlyfiles: restrict processing to this list of files (empty list means all files)
    '''
    print('Segmentation training')
    self.file_directory = file_directory
    self.file_extenstion = file_extenstion
    self.fasta_files, self.filename_mapping = FileUtility.read_fasta_directory(
        self.file_directory, self.file_extenstion, only_files=onlyfiles)
    print(str(len(self.fasta_files)), 'fasta files found in',
          self.file_directory)
    self.dbname = dbname
    self.vocab_size = vocab_size
    self.seg_train_depth = seg_train_depth
    self.rep_sampling_depth = rep_sampling_depth
    self.num_p = num_p
    self.output_directory = output_directory
    DiTaxaWorkflow.ensure_dir(self.output_directory)
    if not os.path.exists(self.output_directory + 'logfile.txt'):
        self.log_file = []
    else:
        self.log_file = FileUtility.load_list(self.output_directory +
                                              'logfile.txt')
    print('pipeline started')
@staticmethod
def load_precalculated(file_path):
    '''
    Load precalculated bootstrapping results.
    :param file_path: path to the pickled results file
    :return: the loaded results object
    '''
    return FileUtility.load_obj(file_path)


if __name__ == '__main__':
    '''
    test-case
    '''
    files = FileUtility.recursive_glob(
        '/mounts/data/proj/asgari/github_repos/microbiomephenotype/data_config/bodysites/',
        '*.txt')
    list_of_files = []
    for file in files:
        list_of_files += FileUtility.load_list(file)
    list_of_files = [x + '.fsa' for x in list_of_files]
    fasta_files, mapping = FileUtility.read_fasta_directory(
        '/mounts/data/proj/asgari/dissertation/datasets/deepbio/microbiome/hmb_data/',
        'fsa',
        only_files=list_of_files)
    BS = BootStrapping(fasta_files, 'body', seqtype='fsa', M=10)
    for k in [3, 4, 5, 6, 7, 8]:
        print(k)
        BS.add_kmer_sampling(k)