def test_reservoir_invalid_input():
    """Drain util.reservoir when asked for more samples than listed records.

    The record list holds a single id while 4 samples are requested.
    NOTE(review): nothing is asserted here and no expected exception is
    declared in this block -- confirm how the failure is meant to be
    detected (e.g. a decorator that is not visible here).
    """
    fasta_path = 'data/ecoli.fasta'
    known_ids = ['NC_002695.1']
    sample_size = 4
    with open(fasta_path, 'r') as handle:
        records = SeqIO.parse(handle, 'fasta')
        # the sampled records are irrelevant, only exhausting the sampler
        # matters
        for _ in util.reservoir(records, known_ids, sample_size):
            pass
def test_reservoir():
    """Sample 2 records from data/genomes.fasta and check 2 come back."""
    fasta_path = 'data/genomes.fasta'
    sample_size = 2

    # first pass over the file: collect what util.count_records reports
    with open(fasta_path, 'r') as handle:
        known_records = util.count_records(handle)

    # second pass: run the sampler and keep the ids of the drawn records
    with open(fasta_path, 'r') as handle:
        sampler = util.reservoir(
            SeqIO.parse(handle, 'fasta'), known_records, sample_size)
        sampled_ids = [drawn.id for drawn in sampler]

    assert len(sampled_ids) == 2
def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and writes them to
    args.output + _R(1|2).fastq

    The work is done in four stages. Every stage logs an error and calls
    sys.exit(1) when it fails:

    1. load the error model selected by --mode (and --model for kde)
    2. gather the genomes given with --genomes, --draft and --ncbi into
       args.output + '.iss.tmp.genomes.fasta', optionally sub-sampled to
       --n_genomes records
    3. get an abundance (or coverage) value for each record
    4. generate reads record by record over args.cpus parallel jobs, then
       concatenate the per-job files and remove the temporary files

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    def temporary_files(prefixes, genome_file):
        """List the temporary files belonging to a run

        Duplicated prefixes are dropped (first occurrence wins, order is
        kept) so that a file shared by two records with the same header is
        neither concatenated nor removed twice.

        Args:
            prefixes (list): prefixes of the temporary read files
            genome_file (string): the temporary concatenated genome file

        Returns:
            tuple: (R1 files, R2 files, every file to remove)
        """
        seen = set()
        unique_prefixes = []
        for prefix in prefixes:
            if prefix not in seen:
                seen.add(prefix)
                unique_prefixes.append(prefix)
        temp_R1 = [prefix + '_R1.fastq' for prefix in unique_prefixes]
        temp_R2 = [prefix + '_R2.fastq' for prefix in unique_prefixes]
        to_remove = temp_R1 + temp_R2
        to_remove.append(genome_file)
        memmap_file = "%s.memmap" % args.output
        if os.path.exists(memmap_file):
            to_remove.append(memmap_file)
        return temp_R1, temp_R2, to_remove

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        # `is not None` rather than truthiness: 0 is a legitimate seed
        if args.seed is not None:
            logger.info('Setting random seed to %i' % args.seed)
            random.seed(args.seed)
            np.random.seed(args.seed)
        if args.mode == 'kde':
            from iss.error_models import kde
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            # names of the profiles shipped next to this module; anything
            # else is passed to the error model unchanged
            builtin_profiles = {
                'hiseq': 'profiles/HiSeq',
                'novaseq': 'profiles/NovaSeq',
                'miseq': 'profiles/MiSeq'
            }
            profile = builtin_profiles.get(args.model.lower())
            if profile is not None:
                npz = os.path.join(os.path.dirname(__file__), profile)
            else:
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' % (
                    args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        elif args.mode == 'perfect':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' % (
                    args.model, args.mode))
            from iss.error_models import perfect
            err_mod = perfect.PerfectErrorModel()
        else:
            # without this branch err_mod stays undefined and the failure
            # only shows up as a NameError once read generation starts
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and concatenate --genomes and --ncbi genomes
        if args.genomes or args.draft or args.ncbi:
            genome_files = []
            if args.genomes:
                genome_files.extend(args.genomes)
            if args.draft:
                logger.warning('--draft is in early experimental stage.')
                logger.warning(
                    'disabling --abundance_file, --coverage and --n_genomes')
                logger.warning('Defaulting to --abundance.')
                genome_files.extend(args.draft)
            if args.ncbi and args.n_genomes_ncbi:
                util.genome_file_exists(args.output + '_ncbi_genomes.fasta')
                # both options are lists of lists. Flatten them *before*
                # comparing their sizes: len(*nested) only works when the
                # option was given exactly once and raises a TypeError
                # otherwise.
                # py2 compatibilty workaround
                # TODO switch to zip(*args.ncbi, *args.n_genomes_ncbi)
                # when we drop python2
                args.ncbi = [x for y in args.ncbi for x in y]
                args.n_genomes_ncbi = [
                    x for y in args.n_genomes_ncbi for x in y]
                # explicit check: an assert would vanish under python -O
                if len(args.ncbi) != len(args.n_genomes_ncbi):
                    logger.error(
                        '--ncbi and --n_genomes_ncbi of unequal lengths. '
                        'Aborting')
                    sys.exit(1)
                for g, n in zip(args.ncbi, args.n_genomes_ncbi):
                    genomes_ncbi = download.ncbi(
                        g, n, args.output + '_ncbi_genomes.fasta')
                    genome_files.append(genomes_ncbi)
            if args.ncbi and not args.n_genomes_ncbi:
                logger.error(
                    '--ncbi/-k requires --n_genomes_ncbi/-U. Aborting.')
                sys.exit(1)
        else:
            logger.error("One of --genomes/-g, --draft, --ncbi/-k is required")
            sys.exit(1)

        genome_file = args.output + '.iss.tmp.genomes.fasta'
        util.concatenate(genome_files, output=genome_file)

        # for n_genomes we use reservoir sampling to draw random genomes
        # from the concatenated genome file. We then override the file.
        if args.n_genomes and not args.draft and not args.ncbi:
            genome_count = util.count_records(genome_file)
            # the sample is fully materialised before the file it was read
            # from gets overwritten
            sampled_genomes = list(util.reservoir(
                SeqIO.parse(genome_file, 'fasta'),
                genome_count,
                args.n_genomes))
            SeqIO.write(sampled_genomes, genome_file, 'fasta')

        # explicit check: an assert would vanish under python -O
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            genome_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        # NOTE(review): kept so that an AssertionError escaping one of the
        # helpers called above is still reported the way it used to be --
        # confirm whether any helper still relies on it
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)
    else:
        abundance_dispatch = {
            'uniform': abundance.uniform,
            'halfnormal': abundance.halfnormal,
            'exponential': abundance.exponential,
            'lognormal': abundance.lognormal,
            'zero_inflated_lognormal': abundance.zero_inflated_lognormal
        }
        # read the abundance file
        if args.abundance_file and not args.draft:
            logger.info('Using abundance file:%s' % args.abundance_file)
            abundance_dic = abundance.parse_abundance_file(args.abundance_file)
        elif args.coverage and not args.draft:
            logger.warning('--coverage is an experimental feature')
            logger.info('Using coverage file:%s' % args.coverage)
            abundance_dic = abundance.parse_abundance_file(args.coverage)
        elif args.abundance in abundance_dispatch:
            logger.info('Using %s abundance distribution' % args.abundance)
            if args.draft:
                abundance_dic = abundance.draft(
                    genome_list,
                    args.draft,
                    abundance_dispatch[args.abundance],
                    args.output)
            else:
                abundance_dic = abundance_dispatch[
                    args.abundance](genome_list)
                abundance.to_file(abundance_dic, args.output)
        else:
            logger.error('Could not get abundance')
            sys.exit(1)

        cpus = args.cpus
        logger.info('Using %s cpus for read generation' % cpus)

        # with --coverage the values of abundance_dic are used directly as
        # coverage, so a total number of reads is not needed
        if not args.coverage:
            n_reads = util.convert_n_reads(args.n_reads)
            logger.info('Generating %s reads' % n_reads)

        # list holding the prefix of all temp files. Created before the try
        # so the interrupt handler below can always rely on it
        temp_file_list = []
        try:
            with open(genome_file, 'r') as f:  # re-opens the file
                fasta_file = SeqIO.parse(f, 'fasta')
                for record in fasta_file:  # generate reads for records
                    try:
                        species_abundance = abundance_dic[record.id]
                    except KeyError as e:
                        logger.error(
                            'Fasta record not found in abundance file: %s'
                            % e)
                        sys.exit(1)
                    else:
                        logger.info('Generating reads for record: %s'
                                    % record.id)
                        genome_size = len(record.seq)

                        if args.coverage:
                            coverage = species_abundance
                        else:
                            coverage = abundance.to_coverage(
                                n_reads,
                                species_abundance,
                                err_mod.read_length,
                                genome_size)
                        n_pairs = int(round(
                            (coverage * genome_size) /
                            err_mod.read_length) / 2)

                        # skip record if n_reads == 0
                        if n_pairs == 0:
                            continue

                        # exact n_reads for each cpus: every job gets the
                        # same share and the last one also takes the
                        # remainder (0 when n_pairs divides evenly)
                        n_pairs_per_cpu = [n_pairs // cpus] * cpus
                        n_pairs_per_cpu[-1] += n_pairs % cpus

                        # due to a bug in multiprocessing
                        # https://bugs.python.org/issue17560
                        # we can't send records taking more than 2**31 bytes
                        # through serialisation.
                        # In those cases we use memmapping
                        if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
                            logger.warning(
                                "record %s unusually big." % record.id)
                            logger.warning("Using a memory map.")
                            mode = "memmap"

                            record_mmap = "%s.memmap" % args.output
                            if os.path.exists(record_mmap):
                                os.unlink(record_mmap)
                            util.dump(record, record_mmap)
                            # from here on the jobs receive the path of the
                            # dump instead of the record itself
                            del record
                            record = record_mmap
                            gc.collect()
                        else:
                            mode = "default"

                        record_file_name_list = Parallel(n_jobs=cpus)(
                            delayed(generator.reads)(
                                record, err_mod,
                                n_pairs_per_cpu[i], i,
                                args.output, args.seed,
                                args.gc_bias, mode) for i in range(cpus))
                        temp_file_list.extend(record_file_name_list)
        except KeyboardInterrupt as e:
            logger.error('iss generate interrupted: %s' % e)
            _, _, full_tmp_list = temporary_files(temp_file_list, genome_file)
            util.cleanup(full_tmp_list)
            sys.exit(1)
        else:
            # remove the duplicates in file list and cleanup
            # we remove the duplicates in case two records had the same header
            # and reads were appended to the same temp file.
            temp_R1, temp_R2, full_tmp_list = temporary_files(
                temp_file_list, genome_file)
            util.concatenate(temp_R1, args.output + '_R1.fastq')
            util.concatenate(temp_R2, args.output + '_R2.fastq')
            util.cleanup(full_tmp_list)
            if args.compress:
                util.compress(args.output + '_R1.fastq')
                util.compress(args.output + '_R2.fastq')
            logger.info('Read generation complete')
def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and writes them to
    args.output + _R(1|2).fastq

    The error model is loaded first, then the genomes are read from
    --genomes or downloaded with --ncbi, an abundance value is obtained for
    each record and reads are finally generated record by record over
    args.cpus parallel jobs. Every failure is logged and ends the program
    with sys.exit(1).

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        if args.mode == 'kde':
            from iss.error_models import kde
            # has to be tested before any call to args.model.lower(),
            # which raises an AttributeError when --model is missing
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            # names of the profiles shipped next to this module; anything
            # else is passed to the error model unchanged
            builtin_profiles = {
                'hiseq': 'profiles/HiSeq',
                'novaseq': 'profiles/NovaSeq',
                'miseq': 'profiles/MiSeq'
            }
            profile = builtin_profiles.get(args.model.lower())
            if profile is not None:
                npz = os.path.join(os.path.dirname(__file__), profile)
            else:
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' % (
                    args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        else:
            # without this branch err_mod stays undefined and the failure
            # only shows up as a NameError once read generation starts
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and generate reads
        if args.genomes:
            genome_file = args.genomes
        elif args.ncbi and args.n_genomes:
            util.genome_file_exists(args.output + '_genomes.fasta')
            total_genomes = []
            # both options are lists of lists. Flatten them *before*
            # comparing their sizes: len(*nested) only works when the
            # option was given exactly once and raises a TypeError
            # otherwise.
            # py2 compatibilty workaround
            # TODO switch to zip(*args.ncbi, *args.n_genomes) when we
            # drop python2
            args.ncbi = [x for y in args.ncbi for x in y]
            args.n_genomes = [x for y in args.n_genomes for x in y]
            # explicit check: an assert would vanish under python -O
            if len(args.ncbi) != len(args.n_genomes):
                logger.error(
                    '--ncbi and --n_genomes of unequal lengths. Aborting')
                sys.exit(1)
            for g, n in zip(args.ncbi, args.n_genomes):
                genomes = download.ncbi(g, n)
                total_genomes.extend(genomes)
            genome_file = download.to_fasta(total_genomes, args.output)
        else:
            logger.error('Invalid input')  # TODO better error handling here
            sys.exit(1)

        # explicit check: an assert would vanish under python -O
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            record_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        # NOTE(review): kept so that an AssertionError escaping one of the
        # helpers called above is still reported the way it used to be --
        # confirm whether any helper still relies on it
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)
    else:
        abundance_dispatch = {
            'uniform': abundance.uniform,
            'halfnormal': abundance.halfnormal,
            'exponential': abundance.exponential,
            'lognormal': abundance.lognormal,
            'zero_inflated_lognormal': abundance.zero_inflated_lognormal
        }
        # read the abundance file
        if args.abundance_file:
            logger.info('Using abundance file:%s' % args.abundance_file)
            abundance_dic = abundance.parse_abundance_file(args.abundance_file)
        elif args.abundance in abundance_dispatch:
            logger.info('Using %s abundance distribution' % args.abundance)
            abundance_dic = abundance_dispatch[args.abundance](record_list)
            abundance.to_file(abundance_dic, args.output)
        else:
            logger.error('Could not get abundance')
            sys.exit(1)

        cpus = args.cpus
        logger.info('Using %s cpus for read generation' % cpus)

        n_reads = util.convert_n_reads(args.n_reads)
        logger.info('Generating %s reads' % n_reads)

        # list holding the prefix of all temp files. Created before the try
        # so the interrupt handler below can always rely on it
        temp_file_list = []
        try:
            with open(genome_file, 'r') as f:  # re-opens the file
                fasta_file = SeqIO.parse(f, 'fasta')
                # --n_genomes only sub-samples local genomes. With --ncbi
                # it is the number of genomes to download instead.
                if args.n_genomes and not args.ncbi:
                    n = args.n_genomes[0][0]
                else:
                    n = None
                # generate reads for records
                for record in util.reservoir(fasta_file, record_list, n):
                    try:
                        species_abundance = abundance_dic[record.id]
                    except KeyError as e:
                        logger.error(
                            'Fasta record not found in abundance file: %s'
                            % e)
                        sys.exit(1)
                    else:
                        logger.info('Generating reads for record: %s'
                                    % record.id)
                        genome_size = len(record.seq)

                        coverage = abundance.to_coverage(
                            n_reads,
                            species_abundance,
                            err_mod.read_length,
                            genome_size
                        )
                        n_pairs = int(round(
                            (coverage * genome_size) /
                            err_mod.read_length) / 2)

                        # good enough approximation
                        n_pairs_per_cpu = int(round(n_pairs / cpus))

                        record_file_name_list = Parallel(n_jobs=cpus)(
                            delayed(generator.reads)(
                                record, err_mod,
                                n_pairs_per_cpu, i,
                                args.output,
                                args.gc_bias) for i in range(cpus))
                        temp_file_list.extend(record_file_name_list)
        except KeyboardInterrupt as e:
            logger.error('iss generate interrupted: %s' % e)
            generator.cleanup(temp_file_list)
            sys.exit(1)
        else:
            # remove the duplicates in file list and cleanup
            # we remove the duplicates in case two records had the same header
            # and reads were appended to the same temp file.
            # The first occurrence is kept so the files are concatenated in
            # the order they were produced, not in arbitrary set order.
            seen = set()
            temp_file_unique = []
            for temp_file in temp_file_list:
                if temp_file not in seen:
                    seen.add(temp_file)
                    temp_file_unique.append(temp_file)
            generator.concatenate(temp_file_unique, args.output)
            generator.cleanup(temp_file_unique)
            logger.info('Read generation complete')