def check_snps():
    """ Check that accessions are sorted.

    Streams the input once, comparing each site's accession (the part of
    site_id before the first '|') to the previous one; exits with an error
    if any accession sorts before its predecessor.

    Fixes: the original referenced an undefined name `last_snp` (NameError)
    and made a redundant first pass that left `last_ref` at the file's
    final accession.
    """
    last_ref = None
    for snp in utility.parse_file(args['in']):
        ref_id = snp['site_id'].rsplit('|')[0]
        # first record has nothing to compare against
        if last_ref is not None and ref_id < last_ref:
            sys.exit("Accessions not sorted")
        last_ref = ref_id
def check_snps():
    """ Check that accessions are sorted.

    Streams the input once, comparing each site's accession (the part of
    site_id before the first '|') to the previous one; exits with an error
    if any accession sorts before its predecessor.

    Fixes: the original referenced an undefined name `last_snp` (NameError)
    and made a redundant first pass that left `last_ref` at the file's
    final accession.
    """
    last_ref = None
    for snp in utility.parse_file(args['in']):
        ref_id = snp['site_id'].rsplit('|')[0]
        # first record has nothing to compare against
        if last_ref is not None and ref_id < last_ref:
            sys.exit("Accessions not sorted")
        last_ref = ref_id
def open_snp_info(indir):
    """ Return a record generator for <indir>/snps_info.txt, or None if absent. """
    inpath = '%s/snps_info.txt' % indir
    # missing info file is not an error; caller handles None
    if os.path.isfile(inpath):
        return utility.parse_file(inpath)
    return None
def open_infiles(species_id, samples):
    """ Open SNP files for species across samples """
    # one parser per sample, in sample order
    return [
        utility.parse_file('%s/snps/output/%s.snps.gz' % (sample.dir, species_id))
        for sample in samples
    ]
def genes_summary(args):
    """ Get summary of mapping statistics per species and write genes/summary.txt.

    Fixes a resource leak: the original opened the summary file without
    ever closing it.
    """
    # store stats
    stats = {}
    inpath = '%s/%s' % (args['outdir'], 'genes/temp/pangenome.map')
    for species_id in set(utility.read_ref_to_cluster(inpath).values()):
        pangenome_size, covered_genes, total_coverage, marker_coverage = [0, 0, 0, 0]
        for r in utility.parse_file('/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id])):
            pangenome_size += 1
            coverage = float(r['coverage'])
            normcov = float(r['copy_number'])
            if coverage > 0:
                covered_genes += 1
                total_coverage += coverage
            if normcov > 0:
                # NOTE(review): overwritten each iteration — ends up as the
                # LAST gene's coverage/copy_number ratio; confirm intended
                marker_coverage = coverage / normcov
        stats[species_id] = {
            'pangenome_size': pangenome_size,
            'covered_genes': covered_genes,
            'fraction_covered': covered_genes / float(pangenome_size),
            'mean_coverage': total_coverage / covered_genes if covered_genes > 0 else 0.0,
            'marker_coverage': marker_coverage}
    # write stats; 'with' guarantees the handle is closed
    fields = ['pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage']
    with open('/'.join([args['outdir'], 'genes/summary.txt']), 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
def build_gene_matrices(species_id, samples, args):
    """ Compute per-sample gene presence/absence, copy number, and depth """
    gene_to_family = read_gene_map(species_id, args)
    genome_ids = set(['.'.join(gene.split('.')[0:2]) for gene in gene_to_family])
    print(" %s genes from %s genomes" % (len(gene_to_family), len(genome_ids)))
    print(" clustered into %s families at %s percent id" % (len(set(gene_to_family.values())), args['cluster_pid']))
    for sample in samples:
        # one accumulator per matrix kind
        sample.genes = {kind: defaultdict(float) for kind in ['presabs', 'copynum', 'depth']}
        inpath = '%s/genes/output/%s.genes.gz' % (sample.dir, species_id)
        for row in utility.parse_file(inpath):
            # translate legacy field names when present
            if 'ref_id' in row:
                row['gene_id'] = row['ref_id']
            if 'normalized_coverage' in row:
                row['copy_number'] = row['normalized_coverage']
            if 'raw_coverage' in row:
                row['coverage'] = row['raw_coverage']
            family_id = gene_to_family[row['gene_id']]
            sample.genes['copynum'][family_id] += float(row['copy_number'])
            sample.genes['depth'][family_id] += float(row['coverage'])
    # call presence/absence by thresholding copy number
    for sample in samples:
        for family_id, copynum in sample.genes['copynum'].items():
            sample.genes['presabs'][family_id] = 1 if copynum >= args['min_copy'] else 0
def genes_summary(args):
    """ Get summary of mapping statistics per species and write genes/summary.txt.

    Fixes a resource leak: the original opened the summary file without
    ever closing it.
    """
    # store stats
    stats = {}
    inpath = '%s/%s' % (args['outdir'], 'genes/temp/pangenome.map')
    for species_id in set(utility.read_ref_to_cluster(inpath).values()):
        pangenome_size, covered_genes, total_coverage, marker_coverage = [0, 0, 0, 0]
        for r in utility.parse_file('/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id])):
            pangenome_size += 1
            coverage = float(r['coverage'])
            normcov = float(r['copy_number'])
            if coverage > 0:
                covered_genes += 1
                total_coverage += coverage
            if normcov > 0:
                # NOTE(review): overwritten each iteration — ends up as the
                # LAST gene's coverage/copy_number ratio; confirm intended
                marker_coverage = coverage / normcov
        stats[species_id] = {
            'pangenome_size': pangenome_size,
            'covered_genes': covered_genes,
            'fraction_covered': covered_genes / float(pangenome_size),
            'mean_coverage': total_coverage / covered_genes if covered_genes > 0 else 0.0,
            'marker_coverage': marker_coverage}
    # write stats; 'with' guarantees the handle is closed
    fields = ['pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage']
    with open('/'.join([args['outdir'], 'genes/summary.txt']), 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
def snps_summary(args):
    """ Get summary of mapping statistics per species and write snps/summary.txt.

    Fixes a resource leak: the original opened the summary file without
    ever closing it.
    """
    # store stats
    stats = {}
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        genome_length, covered_bases, total_depth, identity, maf = [0, 0, 0, 0, 0]
        for r in utility.parse_file('/'.join([args['outdir'], 'snps/output/%s.snps.gz' % species_id])):
            genome_length += 1
            depth = int(r['depth'])
            if depth > 0:
                covered_bases += 1
                total_depth += depth
                if r['ref_allele'] == r['cons_allele']:
                    identity += 1
                ref_freq = float(r['ref_freq'])
                # minor-allele frequency contribution of this site
                maf += ref_freq if ref_freq <= 0.5 else 1 - ref_freq
        # NOTE(review): identity and maf are accumulated but never written — confirm intended
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            'fraction_covered': covered_bases / float(genome_length),
            'mean_coverage': total_depth / float(covered_bases) if covered_bases > 0 else 0}
    # write stats; 'with' guarantees the handle is closed
    fields = ['genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage']
    with open('/'.join([args['outdir'], 'snps/summary.txt']), 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
def open_infiles(species_id, samples):
    """ Open SNP files for species across samples """
    # build the per-sample path, open a parser for each
    return [
        utility.parse_file('%s/snps/output/%s.snps.gz' % (sample.dir, species_id))
        for sample in samples
    ]
def read_annotations(args):
    """ Map species_id -> species_name from the database's species_info.txt """
    inpath = '%s/species_info.txt' % args['db']
    # fail fast with a clear message when the database file is missing
    if not os.path.isfile(inpath):
        sys.exit("File not found: %s" % inpath)
    return {rec['species_id']: rec['species_name']
            for rec in utility.parse_file(inpath)}
def read_annotations(args):
    """ Map species_id -> species_name from the database's species_info.txt """
    inpath = '%s/species_info.txt' % args['db']
    # fail fast with a clear message when the database file is missing
    if not os.path.isfile(inpath):
        sys.exit("File not found: %s" % inpath)
    return {rec['species_id']: rec['species_name']
            for rec in utility.parse_file(inpath)}
def add_summary_stats(args, samples):
    """ Attach mean depth and coverage fraction to each sample from the summary file """
    for rec in utility.parse_file(args['summary']):
        # legacy summaries label samples by run_accession
        if 'run_accession' in rec:
            rec['sample_id'] = rec['run_accession']
        sample = samples.get(rec['sample_id'])
        if sample is not None:
            sample.mean_depth = float(rec['average_depth'])
            sample.fraction_covered = float(rec['fraction_covered'])
    # the synthetic 'between' sample gets the mean depth of all real samples
    depths = [s.mean_depth for s in samples.values() if s.id != 'between']
    samples['between'].mean_depth = np.mean(depths)
def read_cluster_map(sp, db, pid):
    # Populate sp.map: each centroid_99 gene id -> its centroid at the
    # requested percent identity (pid), read from the species' gene_info file.
    sp.map = {}
    # accept either a plain or gzipped gene_info file; the last match wins
    for ext in ['', '.gz']:
        path = '/'.join([db, 'pan_genomes', sp.id, 'gene_info.txt%s' % ext])
        if os.path.isfile(path):
            sp.gene_info = path
    # NOTE(review): if neither file exists, sp.gene_info is never set and the
    # loop below raises AttributeError — confirm callers guarantee the file
    for r in utility.parse_file(sp.gene_info):
        sp.map[r['centroid_99']] = r['centroid_%s' % pid]
def read_stats(inpath, type):
    """ Read per-species summary stats, normalizing legacy field names """
    stats = {}
    # (legacy name, current name) translations applied when present
    renames = [('cluster_id', 'species_id'),
               ('phyeco_coverage', 'marker_coverage'),
               ('average_depth', 'mean_coverage')]
    for rec in utility.parse_file(inpath):
        for old, new in renames:
            if old in rec:
                rec[new] = rec[old]
        if type == 'genes':
            rec['fraction_covered'] = float(rec['covered_genes']) / float(rec['pangenome_size'])
        stats[rec['species_id']] = rec
    return stats
def read_marker_info(args):
    """ Read info for marker genes from phyeco.fa """
    # collect marker gene ids from the fasta; values filled in from the map file
    fasta_path = '%s/marker_genes/phyeco.fa' % args['db']
    info = {seq.id: None for seq in Bio.SeqIO.parse(fasta_path, 'fasta')}
    for rec in utility.parse_file('%s/marker_genes/phyeco.map' % args['db']):
        if rec['gene_id'] in info:
            info[rec['gene_id']] = rec
    return info
def read_stats(inpath, type):
    """ Read per-species summary stats, normalizing legacy field names """
    stats = {}
    # (legacy name, current name) translations applied when present
    renames = [('cluster_id', 'species_id'),
               ('phyeco_coverage', 'marker_coverage'),
               ('average_depth', 'mean_coverage')]
    for rec in utility.parse_file(inpath):
        for old, new in renames:
            if old in rec:
                rec[new] = rec[old]
        if type == 'genes':
            rec['fraction_covered'] = float(rec['covered_genes']) / float(rec['pangenome_size'])
        stats[rec['species_id']] = rec
    return stats
def fetch_samples(args):
    """ initialize samples from indir and set pass_qc flag """
    samples = []
    count_hq = 0
    stop = False
    filters = {'min_depth': args['sample_depth'], 'min_cov': args['fract_cov']}
    for info in utility.parse_file('%s/snps_summary.txt' % args['indir']):
        sample = Sample(info, filters, stop)
        samples.append(sample)
        if sample.pass_qc:
            count_hq += 1
        # once enough high-quality samples exist, later samples are built with stop=True
        if count_hq >= args['max_samples']:
            stop = True
    return samples
def read_genes(db, species_id, contigs):
    """ Read in gene coordinates from features file """
    genes_path = '%s/rep_genomes/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(genes_path):
        # features typed 'RNA' are skipped
        if gene['gene_type'] != 'RNA':
            gene['start'] = int(gene['start'])
            gene['end'] = int(gene['end'])
            gene['seq'] = get_gene_seq(gene, contigs[gene['scaffold_id']])
            genes.append(gene)
    return genes
def read_abundance(inpath):
    """ Parse species abundance file """
    if not os.path.isfile(inpath):
        sys.exit("\nCould not locate species profile: %s\nTry rerunning with run_midas.py species" % inpath)
    abun = {}
    for rec in utility.parse_file(inpath):
        # legacy files key species by 'cluster_id'
        if 'cluster_id' in rec:
            rec['species_id'] = rec['cluster_id']
        # cast numeric fields that are present
        for field, cast in [('count_reads', int), ('coverage', float), ('relative_abundance', float)]:
            if field in rec:
                rec[field] = cast(rec[field])
        abun[rec['species_id']] = rec
    return abun
def call_markers(species, samples, args): """ determine if marker present in each sample """ # open marker list markers = utility.parse_file(species.paths['markers']) marker = fetch_marker(markers) # dictionary for 1st marker allele if marker is None: sys.exit("\nError: no marker alleles found in file: %s\n" % species.paths['markers']) # init markers per sample for sample in samples.values(): sample.markers = set([]) # loop over sites sites = parse.fetch_sites(species, samples) for index, site in enumerate(sites): # record progress if not index % 100000: print("%s sites processed" % index) # stop early if index >= args['max_sites']: break # skip sites not in marker list if (site.ref_id != marker['ref_id'] or site.ref_pos < marker['ref_pos']): continue # determine if marker present in each sample for sample in site.samples.values(): # skip samples without marker if sample.depth == 0: continue elif marker['allele'] == site.ref_allele: sample.marker_freq = sample.ref_freq elif marker['allele'] == sample.alt_allele: sample.marker_freq = 1 - sample.ref_freq else: continue # record marker allele for sample_id sample.marker_count = round(sample.marker_freq * sample.depth) if (sample.marker_freq >= args['min_freq'] and sample.marker_count >= args['min_reads']): sample.markers.add(site.id) # fetch next marker allele marker = fetch_marker(markers) if marker is None: break
def read_genes(db, species_id, contigs):
    """ Read in gene coordinates from features file """
    genes_path = '%s/genome_clusters/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(genes_path):
        # features typed 'rna' are skipped
        if gene['gene_type'] == 'rna':
            continue
        # prefix matches the contig naming used in `contigs`
        gene['accession'] = 'accn|%s' % gene['accession']
        gene['start'] = int(gene['start'])
        gene['end'] = int(gene['end'])
        gene['seq'] = get_gene_seq(gene, contigs[gene['accession']])
        genes.append(gene)
    return genes
def read_genes(db, species_id, contigs):
    """ Read in gene coordinates from features file """
    genes_path = '%s/genome_clusters/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(genes_path):
        # features typed 'rna' are skipped
        if gene['gene_type'] == 'rna':
            continue
        # prefix matches the contig naming used in `contigs`
        gene['accession'] = 'accn|%s' % gene['accession']
        gene['start'] = int(gene['start'])
        gene['end'] = int(gene['end'])
        gene['seq'] = get_gene_seq(gene, contigs[gene['accession']])
        genes.append(gene)
    return genes
def select_species(args, type='genes'): """ Select all species with a minimum number of high-coverage samples""" # read species annotations species_info = {} inpath = os.path.join(args['db'], 'species_info.txt') for rec in utility.parse_file(inpath): species_info[rec['species_id']] = rec # fetch all species with at least 1 sample species = {} for sample in load_samples(args): if not sample.paths[type]: sys.stderr.write("Warning: no %s output for sample: %s\n" % (type, sample.dir)) continue for id, info in read_stats(sample.paths[type], type).items(): if (args['species_id'] and id not in args['species_id'].split(',')): continue # skip unspecified species elif (args['max_samples'] and id in species and len(species[id].samples) >= args['max_samples']): continue # skip species with too many samples elif float(info['mean_coverage']) < args['sample_depth']: continue # skip low-coverage sample elif type == 'snps' and float( info['fraction_covered']) < args['fract_cov']: continue # skip low-coverage sample if id not in species: species[id] = Species(id, species_info) # initialize new species species[id].samples.append(sample) # append sample # dict to list species = list(species.values()) # remove species with an insufficient number of samples species = [ sp for sp in species if len(sp.samples) >= int(args['min_samples']) ] # sort by number of samples species = sort_species(species) # select a subset of species to analyze if args['max_species'] is not None and len(species) > args['max_species']: species = species[0:args['max_species']] # create species directories for sp in species: outdir = os.path.join(args['outdir'], sp.id) if not os.path.isdir(outdir): os.mkdir(outdir) print(" found %s species with sufficient high-coverage samples\n" % len(species)) return species
def parse_sites(indir, samples):
    """ yield genomic sites from input files """
    n_sites = 0
    # one tab-separated reader per per-site matrix
    handles = {}
    for ext in ['alt_allele', 'depth', 'ref_freq']:
        handles[ext] = parse_tsv('%s/snps_%s.txt' % (indir, ext))
    info = utility.parse_file('%s/snps_info.txt' % indir)
    while True:
        site = GenomicSite(handles, samples, info)
        # an empty id signals end of input
        if not site.id:
            break
        n_sites += 1
        yield site
    # close readers once all sites are exhausted
    for handle in handles.values():
        handle.close()
def read_abundance(inpath):
    """ Parse species abundance file.

    Returns a dict keyed by species_id with numeric fields cast to
    int/float. Exits with an error message if the profile is missing.
    """
    if not os.path.isfile(inpath):
        sys.exit(
            "\nCould not locate species profile: %s\nTry rerunning with run_midas.py species"
            % inpath)
    abun = {}
    for rec in utility.parse_file(inpath):
        # fix: the original line was a no-op self-assignment of 'species_id';
        # legacy files key species by 'cluster_id' (see the sibling
        # read_abundance implementation)
        if 'cluster_id' in rec:
            rec['species_id'] = rec['cluster_id']
        if 'count_reads' in rec:
            rec['count_reads'] = int(rec['count_reads'])
        if 'coverage' in rec:
            rec['coverage'] = float(rec['coverage'])
        if 'relative_abundance' in rec:
            rec['relative_abundance'] = float(rec['relative_abundance'])
        abun[rec['species_id']] = rec
    return abun
def parse_sites(indir, site_depth=0, info_path=None, max_sites=None):
    """ Parse reference frequency matrix and snp info file.

    Yields GenomicSite objects until the input is exhausted or max_sites
    is reached. Fixes a resource leak: the original closed `freq` and
    `depth` but never the `alleles` handle.
    """
    index = 0
    sample_ids = list_sample_ids('%s/snps_ref_freq.txt' % indir)
    freq = open('%s/snps_ref_freq.txt' % indir)
    next(freq)  # skip header row
    depth = open('%s/snps_depth.txt' % indir)
    next(depth)
    alleles = open('%s/snps_alt_allele.txt' % indir)
    next(alleles)
    # info records are optional
    info = utility.parse_file(info_path) if info_path else None
    while True:
        site = GenomicSite(sample_ids, freq, depth, alleles, site_depth, info)
        if not site.id:
            break
        elif max_sites and index >= max_sites:
            break
        else:
            index += 1
            yield site
    freq.close()
    depth.close()
    alleles.close()  # fix: previously leaked
def select_species(args, type='genes'):
    """ Select all species with a minimum number of high-coverage samples.

    Fixes for Python 3 and consistency with the sibling implementation:
    print statement -> print() function, dict.values() wrapped in list()
    before slicing, int() cast on args['min_samples'], and the warning
    message now names the output type instead of hard-coding 'genes'.
    """
    # read species annotations
    species_info = {}
    inpath = os.path.join(args['db'], 'species_info.txt')
    for rec in utility.parse_file(inpath):
        species_info[rec['species_id']] = rec
    # fetch all species with at least 1 sample
    species = {}
    for sample in load_samples(args):
        if not sample.paths[type]:
            sys.stderr.write("Warning: no %s output for sample: %s\n" % (type, sample.dir))
            continue
        for id, info in read_stats(sample.paths[type], type).items():
            if (args['species_id'] and id not in args['species_id'].split(',')):
                continue  # skip unspecified species
            elif (args['max_samples'] and id in species and len(species[id].samples) >= args['max_samples']):
                continue  # skip species with too many samples
            elif float(info['mean_coverage']) < args['sample_depth']:
                continue  # skip low-coverage sample
            elif type == 'snps' and float(info['fraction_covered']) < args['fract_cov']:
                continue  # skip low-coverage sample
            if id not in species:
                species[id] = Species(id, species_info)  # initialize new species
            species[id].samples.append(sample)  # append sample
    # dict to list; list() required under Python 3 for slicing below
    species = list(species.values())
    # remove species with an insufficient number of samples
    species = [sp for sp in species if len(sp.samples) >= int(args['min_samples'])]
    # sort by number of samples
    species = sort_species(species)
    # select a subset of species to analyze
    if args['max_species'] is not None and len(species) > args['max_species']:
        species = species[0:args['max_species']]
    # create species directories
    for sp in species:
        outdir = os.path.join(args['outdir'], sp.id)
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
    print(" found %s species with sufficient high-coverage samples\n" % len(species))
    return species
def snps_summary(args):
    """ Get summary of mapping statistics per species and write snps/summary.txt.

    Fixes a resource leak: the original opened the summary file without
    ever closing it.
    """
    # store stats
    stats = {}
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        genome_length, covered_bases, total_depth, identity, maf = [0, 0, 0, 0, 0]
        for r in utility.parse_file('/'.join([args['outdir'], 'snps/output/%s.snps.gz' % species_id])):
            genome_length += 1
            depth = int(r['depth'])
            if depth > 0:
                covered_bases += 1
                total_depth += depth
                if r['ref_allele'] == r['cons_allele']:
                    identity += 1
                ref_freq = float(r['ref_freq'])
                # minor-allele frequency contribution of this site
                maf += ref_freq if ref_freq <= 0.5 else 1 - ref_freq
        # NOTE(review): identity and maf are accumulated but never written — confirm intended
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            'fraction_covered': covered_bases / float(genome_length),
            'mean_coverage': total_depth / float(covered_bases) if covered_bases > 0 else 0}
    # write stats; 'with' guarantees the handle is closed
    fields = ['genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage']
    with open('/'.join([args['outdir'], 'snps/summary.txt']), 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
def fetch_samples(args):
    """ List and select samples from input """
    from midas.analyze.diversity import Sample
    samples = []
    passed = 0  # running count of samples that passed QC so far
    for info in parse_file('%s/snps_summary.txt' % args['indir']):
        # init sample
        sample = Sample(info)
        sample.pass_qc = True
        # keep/exclude sample
        if sample.filter(args['sample_depth'], args['fract_cov']):
            sample.pass_qc = False
        if args['keep_samples'] and sample.id not in args['keep_samples']:
            sample.pass_qc = False
        if args['exclude_samples'] and sample.id in args['exclude_samples']:
            sample.pass_qc = False
        if passed >= args['max_samples']:
            sample.pass_qc = False
        if sample.pass_qc:
            passed += 1
        # store sample
        samples.append(sample)
    # select random sample
    if args['rand_samples']:
        resample_samples(samples)
    return samples
def build_gene_matrices(sp, min_copy):
    """ Compute gene copy numbers for samples """
    # matrix name -> accumulator value type
    matrix_types = [('presabs', float), ('copynum', float), ('depth', float), ('reads', int)]
    for sample in sp.samples:
        sample.genes = {name: defaultdict(dtype) for name, dtype in matrix_types}
        inpath = '%s/genes/output/%s.genes.gz' % (sample.dir, sp.id)
        for row in utility.parse_file(inpath):
            # translate legacy field names when present
            if 'ref_id' in row:
                row['gene_id'] = row['ref_id']
            if 'normalized_coverage' in row:
                row['copy_number'] = row['normalized_coverage']
            if 'raw_coverage' in row:
                row['coverage'] = row['raw_coverage']
            gene_id = sp.map[row['gene_id']]
            sample.genes['copynum'][gene_id] += float(row['copy_number'])
            sample.genes['depth'][gene_id] += float(row['coverage'])
            sample.genes['reads'][gene_id] += int(row['count_reads']) if 'count_reads' in row else 0
    # call presence/absence by thresholding copy number
    for sample in sp.samples:
        for gene_id, copynum in sample.genes['copynum'].items():
            sample.genes['presabs'][gene_id] = 1 if copynum >= min_copy else 0
def build_gene_matrices(species_id, samples, args):
    """ Compute gene copy numbers for samples.

    Fix: the sibling implementations translate the legacy 'ref_id' field to
    'gene_id' before lookup; this version omitted that, so old-format gene
    files (which lack 'gene_id') would raise KeyError.
    """
    gene_to_family = read_gene_map(species_id, args)
    count_genes = len(gene_to_family.keys())
    count_genomes = len(set(['.'.join(x.split('.')[0:2]) for x in gene_to_family]))
    count_families = len(set(gene_to_family.values()))
    print(" %s genes from %s genomes" % (count_genes, count_genomes))
    print(" clustered into %s families at %s percent id" % (count_families, args['cluster_pid']))
    for sample in samples:
        sample.genes = {}
        for type in ['presabs', 'copynum', 'depth']:
            sample.genes[type] = defaultdict(float)
        inpath = '%s/genes/output/%s.genes.gz' % (sample.dir, species_id)
        for r in utility.parse_file(inpath):
            # translate legacy field names when present
            if 'ref_id' in r:
                r['gene_id'] = r['ref_id']  # fix: previously missing
            if 'normalized_coverage' in r:
                r['copy_number'] = r['normalized_coverage']
            if 'raw_coverage' in r:
                r['coverage'] = r['raw_coverage']
            gene_id = gene_to_family[r['gene_id']]
            sample.genes['copynum'][gene_id] += float(r['copy_number'])
            sample.genes['depth'][gene_id] += float(r['coverage'])
    # call presence/absence by thresholding copy number
    for sample in samples:
        for gene_id, copynum in sample.genes['copynum'].items():
            if copynum >= args['min_copy']:
                sample.genes['presabs'][gene_id] = 1
            else:
                sample.genes['presabs'][gene_id] = 0
def snps_summary(args, species):
    """ Get summary of mapping statistics """
    stats = {}
    for species_id in species:
        genome_length = 0
        covered_bases = 0
        total_depth = 0
        maf = 0
        snps_path = '/'.join([args['outdir'], 'snps/output/%s.snps.gz' % species_id])
        for r in utility.parse_file(snps_path):
            genome_length += 1
            site_depth = int(r['depth'])
            if site_depth > 0:
                covered_bases += 1
                total_depth += site_depth
                ref_freq = float(r['ref_freq'])
                # minor-allele frequency contribution of this site
                maf += min(ref_freq, 1 - ref_freq)
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            'fraction_covered': covered_bases / float(genome_length),
            'mean_coverage': total_depth / float(covered_bases) if covered_bases > 0 else 0}
    # write one row per species
    fields = ['genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage']
    with open('/'.join([args['outdir'], 'snps/summary.txt']), 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            row = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(row) + '\n')
def list_genes(args):
    """ List the set of genes from snps_info.txt """
    # collect non-empty gene ids
    return {r['gene_id']
            for r in parse_file('%s/snps_info.txt' % args['indir'])
            if r['gene_id'] != ''}
def read_annotations(args):
    """ Map species_id -> its full info record from species_info.txt """
    inpath = '%s/species_info.txt' % args['db']
    return {r['species_id']: r for r in utility.parse_file(inpath)}