def run_pipeline(args):
    """Run the SNP pipeline stages selected by the flags in ``args``.

    Species and contig reference data are always loaded first. The other
    stages run only when their flag is truthy: ``args['build_db']``,
    ``args['align']``, ``args['call']`` and ``args['remove_temp']``.
    Stage headers are printed and, from the database stage onwards, also
    written to ``args['log']``. The align stage stores the detected input
    type in ``args['file_type']``.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Load reference data
    print("\nReading reference data")
    start = time()
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    report_usage(start)

    # Build the representative-genome database for the selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        build_genome_db(args, species)
        report_usage(start)

    # Map reads to one representative genome per species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        report_usage(start)

    # Run the pileup, then split/format it per species and summarize
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        report_usage(start)

        # NOTE(review): the source layout was lost; formatting is assumed to
        # belong to the 'call' branch -- confirm. `start` is not reset here,
        # so the time reported below also covers the pileup step.
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        format_pileup(args, species, contigs)
        snps_summary(args, species)
        report_usage(start)

    # Temporary files are removed only on request
    if args['remove_temp']:
        remove_tmp(args)
def pysam_pileup(args, species, contigs):
    """Run ``species_pileup`` for every species and store the returned stats.

    One task ``[args, species_id, contigs]`` is built per key of ``species``
    and dispatched through ``utility.parallel`` with ``args['threads']``.
    Each result is a ``(species_id, stats)`` pair whose values are copied
    onto ``species[species_id]``. ``fraction_covered`` and ``mean_coverage``
    are set only when their denominator is positive.
    """
    start = time()
    print("\nCounting alleles")
    args['log'].write("\nCounting alleles\n")

    # One pileup task per species, executed in parallel
    tasks = [[args, species_id, contigs] for species_id in species]
    aln_stats = utility.parallel(species_pileup, tasks, args['threads'])

    # Copy alignment statistics onto the species objects
    stat_fields = (
        'genome_length',
        'covered_bases',
        'total_depth',
        'aligned_reads',
        'mapped_reads',
    )
    for species_id, stats in aln_stats:
        sp = species[species_id]
        for field in stat_fields:
            setattr(sp, field, stats[field])
        # Derived ratios are skipped when the denominator would be zero
        if sp.genome_length > 0:
            sp.fraction_covered = sp.covered_bases / float(sp.genome_length)
        if sp.covered_bases > 0:
            sp.mean_coverage = sp.total_depth / float(sp.covered_bases)

    elapsed_minutes = round((time() - start) / 60, 2)
    print(" %s minutes" % elapsed_minutes)
    print(" %s Gb maximum memory" % utility.max_mem_usage())
def run_pipeline(args):
    """Run the pangenome (genes) pipeline stages selected by ``args``.

    When ``args`` has a ``'db'`` key, an ``IGGdb`` is created and stored in
    ``args['iggdb']``; it is built from ``args['dbtoc']`` if that is truthy,
    otherwise from ``<db>/metadata/species_info.tsv``. Species and genes are
    then initialized, and the ``build_db``, ``align``, ``cov`` and
    ``remove_temp`` stages run when their flag in ``args`` is truthy.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Load reference data
    print("\nReading reference data")
    start = time()
    if 'db' in args:
        # An explicit table of contents takes precedence over the default path
        if args.get('dbtoc'):
            toc_path = f"{args['dbtoc']}"
        else:
            toc_path = f"{args['db']}/metadata/species_info.tsv"
        args['iggdb'] = IGGdb(toc_path)
    species = initialize_species(args)
    genes = initialize_genes(args, species)
    report_usage(start)

    # Build the pangenome database for the selected species
    if args['build_db']:
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        start = time()
        build_pangenome_db(args, species)
        report_usage(start)

    # Align reads to the pangenome database
    if args['align']:
        start = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        pangenome_align(args)
        report_usage(start)

    # Compute pangenome coverage per species
    if args['cov']:
        start = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        pangenome_coverage(args, species, genes)
        report_usage(start)

    # Temporary files are removed only on request
    if args['remove_temp']:
        remove_tmp(args)
def run_pipeline(args):
    """Run the SNP pipeline stages selected by the flags in ``args``.

    Stages run only when their flag is truthy: ``args['build_db']``,
    ``args['align']``, ``args['call']`` and ``args['remove_temp']``.
    Stage headers are printed and also written to ``args['log']``. The
    align stage stores the detected input type in ``args['file_type']``.
    """

    # Build genome database for selected GCs
    if args['build_db']:
        import species
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_genome_db(args, genome_clusters)
        print(" %s minutes" % round((time() - start) / 60, 2))
        # The value must be formatted inside the call: print() returns None,
        # so `print("...") % value` raises TypeError on Python 3.
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to map reads to a representative genome for each genome-cluster
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        print(" %s minutes" % round((time() - start) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Use mpileup to identify SNPs
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        print(" %s minutes" % round((time() - start) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

        # Split vcf into files for each GC, format, and report summary statistics.
        # NOTE(review): the source layout was lost; this step is assumed to
        # belong to the 'call' branch -- confirm. `start` is not reset, so
        # the time reported below also covers the mpileup step.
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        split_vcf(args)
        format_vcf(args)
        snps_summary(args)
        print(" %s minutes" % round((time() - start) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']:
        remove_tmp(args)
def run_pipeline(args):
    """Run the SNP pipeline stages selected by the flags in ``args``.

    Stages run only when their flag is truthy: ``args['build_db']``,
    ``args['align']``, ``args['call']`` and ``args['remove_temp']``.
    Stage headers are printed and also written to ``args['log']``. The
    align stage stores the detected input type in ``args['file_type']``.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Database of representative genomes for the selected genome clusters
    if args['build_db']:
        from midas.run import species
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_genome_db(args, genome_clusters)
        report_usage(start)

    # Map reads to one representative genome per genome cluster
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        report_usage(start)

    # Run mpileup, then split/format the vcf and summarize
    if args['call']:
        start = time()
        print("\nRunning mpileup")
        args['log'].write("\nRunning mpileup\n")
        pileup(args)
        report_usage(start)

        # NOTE(review): the source layout was lost; formatting is assumed to
        # belong to the 'call' branch -- confirm. `start` is not reset here,
        # so the time reported below also covers the mpileup step.
        print("\nFormatting output")
        args['log'].write("\nFormatting output\n")
        split_vcf(args)
        format_vcf(args)
        snps_summary(args)
        report_usage(start)

    # Temporary files are removed only on request
    if args['remove_temp']:
        remove_tmp(args)
def run_pipeline(args):
    """Run the SNP pipeline stages selected by the flags in ``args``.

    When ``args`` has a ``'db'`` key, an ``IGGdb`` is created and stored in
    ``args['iggdb']``; it is built from ``args['dbtoc']`` if that is truthy,
    otherwise from ``<db>/metadata/species_info.tsv``. Species and contigs
    are then initialized, and the ``build_db``, ``align``, ``call`` and
    ``remove_temp`` stages run when their flag in ``args`` is truthy. The
    align stage stores the detected input type in ``args['file_type']``.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Load reference data
    print("\nReading reference data")
    start = time()
    if 'db' in args:
        # An explicit table of contents takes precedence over the default path
        if args.get('dbtoc'):
            toc_path = f"{args['dbtoc']}"
        else:
            toc_path = f"{args['db']}/metadata/species_info.tsv"
        args['iggdb'] = IGGdb(toc_path)
    species = initialize_species(args)
    contigs = initialize_contigs(species)
    report_usage(start)

    # Build the representative-genome database for the selected species
    if args['build_db']:
        print("\nBuilding database of representative genomes")
        args['log'].write("\nBuilding database of representative genomes\n")
        start = time()
        build_genome_db(args, species)
        report_usage(start)

    # Map reads to one representative genome per species
    if args['align']:
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        print("\nMapping reads to representative genomes")
        args['log'].write("\nMapping reads to representative genomes\n")
        start = time()
        genome_align(args)
        report_usage(start)

    # Index the BAM, count alleles per species, then summarize SNPs
    if args['call']:
        index_bam(args)
        pysam_pileup(args, species, contigs)
        snps_summary(args, species)

    # Temporary files are removed only on request
    if args['remove_temp']:
        remove_tmp(args)
def estimate_abundance(args):
    """Estimate species abundance from reads aligned to the marker-gene database.

    Steps, in order: read annotations, align reads, classify reads (best
    hits, then unique and non-unique assignment), normalize counts by total
    gene length, and write the result under ``args['outdir']``. Stage
    headers are printed and also written to ``args['log']``. When
    ``args['remove_temp']`` is truthy, ``<outdir>/species/temp`` is deleted.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        # The value must be formatted inside the call: print() returns None,
        # so `print("...") % value` raises TypeError on Python 3.
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # impute missing args & get relative file paths
    species_info = read_annotations(args)

    # align reads
    start = time()
    print("\nAligning reads to marker-genes database")
    args['log'].write("\nAligning reads to marker-genes database\n")
    map_reads_hsblast(args)
    report_usage(start)

    # find best hit for each read
    start = time()
    print("\nClassifying reads")
    args['log'].write("\nClassifying reads\n")
    best_hits = find_best_hits(args)
    unique_alns = assign_unique(args, best_hits, species_info)
    cluster_alns = assign_non_unique(args, best_hits, unique_alns)
    report_usage(start)

    # estimate genome cluster abundance
    start = time()
    print("\nEstimating species abundance")
    args['log'].write("\nEstimating species abundance\n")
    total_gene_length = read_gene_lengths(args, species_info)
    species_abundance = normalize_counts(cluster_alns, total_gene_length)
    report_usage(start)

    # write results
    write_abundance(args['outdir'], species_abundance, species_info)

    # clean up
    if args['remove_temp']:
        import shutil
        shutil.rmtree('%s/species/temp' % args['outdir'])
def run_pipeline(args):
    """Estimate species abundance from reads aligned to the marker-gene database.

    Steps, in order: read species and marker info, align reads, classify
    reads (best hits, then unique and non-unique assignment), normalize
    counts by total gene length, and write the result under
    ``args['outdir']``. Stage headers are printed and also written to
    ``args['log']``. When ``args['remove_temp']`` is truthy,
    ``<outdir>/species/temp`` is deleted.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Load the info tables used by the later steps
    species_info = read_annotations(args)
    marker_info = read_marker_info(args)

    # Alignment
    start = time()
    print("\nAligning reads to marker-genes database")
    args['log'].write("\nAligning reads to marker-genes database\n")
    map_reads_hsblast(args)
    report_usage(start)

    # Read classification: best hit per read, then unique / non-unique assignment
    start = time()
    print("\nClassifying reads")
    args['log'].write("\nClassifying reads\n")
    best_hits = find_best_hits(args, marker_info)
    unique_alns = assign_unique(args, best_hits, species_info, marker_info)
    species_alns = assign_non_unique(args, best_hits, unique_alns, marker_info)
    report_usage(start)

    # Abundance estimation
    start = time()
    print("\nEstimating species abundance")
    args['log'].write("\nEstimating species abundance\n")
    total_gene_length = read_gene_lengths(args, species_info, marker_info)
    species_abundance = normalize_counts(species_alns, total_gene_length)
    report_usage(start)

    # Output
    write_abundance(args['outdir'], species_abundance, species_info)

    # Temporary directory is removed only on request
    if args['remove_temp']:
        import shutil
        shutil.rmtree('%s/species/temp' % args['outdir'])
def index_bam(args):
    """Index ``<outdir>/snps/temp/genomes.bam`` with ``samtools index``.

    The command is assembled from ``args['samtools']``, ``args['threads']``
    and ``args['outdir']``, written to ``args['log']``, and run through the
    shell. The finished process is handed to ``utility.check_exit_code``.
    Elapsed time and peak memory are printed at the end.
    """
    start = time()
    print("\nIndexing bamfile")
    args['log'].write("\nIndexing bamfile\n")

    # Assemble the samtools command line
    samtools = args['samtools']
    n_threads = int(args['threads'])
    outdir = args['outdir']
    command = '%s index -@ %d %s/snps/temp/genomes.bam' % (samtools, n_threads, outdir)
    args['log'].write('command: ' + command + '\n')

    # Run it with both output streams captured, then validate the exit status
    proc = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    utility.check_exit_code(proc, command)

    elapsed_minutes = round((time() - start) / 60, 2)
    print(" %s minutes" % elapsed_minutes)
    print(" %s Gb maximum memory" % utility.max_mem_usage())
def run_pipeline(args):
    """Run the pangenome pipeline stages selected by the flags in ``args``.

    Stages run only when their flag is truthy: ``args['build_db']``,
    ``args['align']``, ``args['cov']`` and ``args['remove_temp']``.
    Stage headers are printed and also written to ``args['log']``. The
    align stage stores the detected input type in ``args['file_type']``.
    """

    def report_usage(since):
        # Print minutes elapsed since `since`, then peak memory usage.
        print(" %s minutes" % round((time() - since) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Pangenome database for the selected genome clusters
    if args['build_db']:
        from midas.run import species
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_pangenome_db(args, genome_clusters)
        report_usage(start)

    # Read alignment against the pangenome database
    if args['align']:
        start = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        pangenome_align(args)
        report_usage(start)

    # Per-species pangenome coverage and gene summary
    if args['cov']:
        start = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        compute_pangenome_coverage(args)
        genes_summary(args)
        report_usage(start)

    # Temporary files are removed only on request
    if args['remove_temp']:
        remove_tmp(args)
def run_pipeline(args):
    """Run the pangenome pipeline stages selected by the flags in ``args``.

    Stages run only when their flag is truthy: ``args['build_db']``,
    ``args['align']``, ``args['cov']`` and ``args['remove_temp']``.
    Stage headers are printed and also written to ``args['log']``. The
    align stage stores the detected input type in ``args['file_type']``.
    """

    # Build pangenome database for selected GCs
    if args['build_db']:
        import species
        print("\nBuilding pangenome database")
        args['log'].write("\nBuilding pangenome database\n")
        start = time()
        genome_clusters = species.select_genome_clusters(args)
        build_pangenome_db(args, genome_clusters)
        print(" %s minutes" % round((time() - start) / 60, 2))
        # The value must be formatted inside the call: print() returns None,
        # so `print("...") % value` raises TypeError on Python 3.
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Use bowtie2 to align reads to pangenome database
    if args['align']:
        start = time()
        print("\nAligning reads to pangenomes")
        args['log'].write("\nAligning reads to pangenomes\n")
        args['file_type'] = utility.auto_detect_file_type(args['m1'])
        pangenome_align(args)
        print(" %s minutes" % round((time() - start) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Compute pangenome coverage for each species
    if args['cov']:
        start = time()
        print("\nComputing coverage of pangenomes")
        args['log'].write("\nComputing coverage of pangenomes\n")
        compute_pangenome_coverage(args)
        genes_summary(args)
        print(" %s minutes" % round((time() - start) / 60, 2))
        print(" %s Gb maximum memory" % utility.max_mem_usage())

    # Optionally remove temporary files
    if args['remove_temp']:
        remove_tmp(args)
def pysam_pileup(args, species, contigs):
    """Run ``species_pileup`` for every species in a process pool and store the stats.

    ``args`` and a plain-dict copy of ``contigs`` are published through the
    module globals ``global_args`` and ``global_contigs`` rather than passed
    as task arguments; each task carries only ``[species_id]``. Every result
    is a ``(species_id, stats)`` pair whose values are converted to ``int``
    and copied onto ``species[species_id]``. ``fraction_covered`` and
    ``mean_coverage`` are set only when their denominator is positive.
    """
    # We cannot pass args to a subprocess because args['log'] is an object,
    # so it is shared through a module global instead.
    # TODO: clean this up.
    # NOTE(review): presumably species_pileup reads these globals in the
    # worker processes, which would rely on workers inheriting module state
    # at pool creation -- confirm.
    global global_args, global_contigs

    start = time()
    print("\nCounting alleles")
    args['log'].write("\nCounting alleles\n")

    global_args = args

    # Flatten contig objects into plain dicts of strings. This may not be
    # needed for contigs; it was an attempt to eliminate the nonserializable
    # subprocess argument (which is args).
    tsprint("Reading contigs")
    contigs = {
        str(c.id): {
            'species_id': str(c.species_id),
            'length': str(c.length),
            'seq': "".join(c.seq),
        }
        for c in contigs.values()
    }
    argument_list = [[species_id] for species_id in species]
    global_contigs = contigs
    tsprint("Read contigs")

    # Both globals are assigned before the pool exists. The `with` block
    # shuts the worker processes down once starmap has returned (or raised),
    # instead of leaving the pool open.
    with multiprocessing.Pool(int(args['threads'])) as pool:
        aln_stats = pool.starmap(species_pileup, argument_list)

    # update alignment stats for species objects
    for species_id, stats in aln_stats:
        sp = species[species_id]
        sp.genome_length = int(stats['genome_length'])
        sp.covered_bases = int(stats['covered_bases'])
        sp.total_depth = int(stats['total_depth'])
        sp.aligned_reads = int(stats['aligned_reads'])
        sp.mapped_reads = int(stats['mapped_reads'])
        # Derived ratios are skipped when the denominator would be zero
        if sp.genome_length > 0:
            sp.fraction_covered = sp.covered_bases / float(sp.genome_length)
        if sp.covered_bases > 0:
            sp.mean_coverage = sp.total_depth / float(sp.covered_bases)

    print(" %s minutes" % round((time() - start) / 60, 2))
    print(" %s Gb maximum memory" % utility.max_mem_usage())