Exemplo n.º 1
0
def check_snps():
    """Abort if the accessions in the input SNP file are not sorted.

    Reads the file named by the module-level ``args['in']`` once and
    compares the accession of every record (the part of ``site_id`` before
    the first ``|``) with the accession of the record that precedes it.

    Raises:
        SystemExit: if an accession sorts before the one preceding it.
    """
    last_ref = None
    for snp in utility.parse_file(args['in']):
        ref_id = snp['site_id'].split('|')[0]
        # the first record has no predecessor to compare against
        if last_ref is not None and ref_id < last_ref:
            sys.exit("Accessions not sorted")
        last_ref = ref_id
Exemplo n.º 2
0
def check_snps():
    """Abort if the accessions in the input SNP file are not sorted.

    Reads the file named by the module-level ``args['in']`` once and
    compares the accession of every record (the part of ``site_id`` before
    the first ``|``) with the accession of the record that precedes it.

    Raises:
        SystemExit: if an accession sorts before the one preceding it.
    """
    last_ref = None
    for snp in utility.parse_file(args['in']):
        ref_id = snp['site_id'].split('|')[0]
        # the first record has no predecessor to compare against
        if last_ref is not None and ref_id < last_ref:
            sys.exit("Accessions not sorted")
        last_ref = ref_id
Exemplo n.º 3
0
def open_snp_info(indir):
    """Return a record parser for ``<indir>/snps_info.txt``.

    Returns None when that file does not exist.
    """
    info_path = '%s/snps_info.txt' % indir
    return utility.parse_file(info_path) if os.path.isfile(info_path) else None
Exemplo n.º 4
0
def open_infiles(species_id, samples):
    """Return one SNP record parser per sample for the given species.

    The parsers are listed in the same order as ``samples``; each reads
    ``<sample.dir>/snps/output/<species_id>.snps.gz``.
    """
    return [
        utility.parse_file(
            '%s/snps/output/%s.snps.gz' % (sample.dir, species_id))
        for sample in samples
    ]
Exemplo n.º 5
0
Arquivo: genes.py Projeto: palc/MIDAS
def genes_summary(args):
    """Write per-species pangenome mapping statistics to genes/summary.txt.

    For every species listed in ``<outdir>/genes/temp/pangenome.map`` the
    matching ``<outdir>/genes/output/<species_id>.genes.gz`` file is read
    and summarised as:

    * pangenome_size   -- number of records in the genes file
    * covered_genes    -- records with coverage > 0
    * fraction_covered -- covered_genes / pangenome_size (0.0 if no records)
    * mean_coverage    -- mean coverage over covered genes (0.0 if none)
    * marker_coverage  -- coverage / copy_number of the last record whose
                          copy_number is > 0 (0 if there is none)

    Args:
        args: options dict; only ``args['outdir']`` is read.
    """
    fields = ['pangenome_size', 'covered_genes', 'fraction_covered',
              'mean_coverage', 'marker_coverage']

    # store stats
    stats = {}
    inpath = '%s/%s' % (args['outdir'], 'genes/temp/pangenome.map')
    for species_id in set(utility.read_ref_to_cluster(inpath).values()):
        pangenome_size, covered_genes, total_coverage, marker_coverage = 0, 0, 0, 0
        genes_path = '/'.join(
            [args['outdir'], 'genes/output/%s.genes.gz' % species_id])
        for r in utility.parse_file(genes_path):
            pangenome_size += 1
            coverage = float(r['coverage'])
            normcov = float(r['copy_number'])
            if coverage > 0:
                covered_genes += 1
                total_coverage += coverage
            if normcov > 0:
                marker_coverage = coverage / normcov
        stats[species_id] = {
            'pangenome_size': pangenome_size,
            'covered_genes': covered_genes,
            # an empty genes file must not abort the whole summary
            'fraction_covered': (covered_genes / float(pangenome_size)
                                 if pangenome_size > 0 else 0.0),
            'mean_coverage': (total_coverage / covered_genes
                              if covered_genes > 0 else 0.0),
            'marker_coverage': marker_coverage,
        }

    # write stats; the context manager closes the file even on error
    summary_path = '/'.join([args['outdir'], 'genes/summary.txt'])
    with open(summary_path, 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
Exemplo n.º 6
0
def build_gene_matrices(species_id, samples, args):
    """Fill ``sample.genes`` with per-family matrices for every sample.

    Each sample gets three defaultdicts keyed by gene family: 'copynum'
    and 'depth' are summed over the genes of the family, and 'presabs' is
    1 when the summed copy number reaches ``args['min_copy']``, else 0.
    """
    gene_to_family = read_gene_map(species_id, args)
    n_genes = len(gene_to_family.keys())
    genome_ids = set(['.'.join(gene.split('.')[0:2]) for gene in gene_to_family])
    n_families = len(set(gene_to_family.values()))
    print("    %s genes from %s genomes" % (n_genes, len(genome_ids)))
    print("    clustered into %s families at %s percent id" %
          (n_families, args['cluster_pid']))

    # (old field name, current field name) pairs found in older output files
    legacy_fields = (
        ('ref_id', 'gene_id'),
        ('normalized_coverage', 'copy_number'),
        ('raw_coverage', 'coverage'),
    )

    # accumulate copy number and depth per gene family
    for sample in samples:
        sample.genes = dict(
            (name, defaultdict(float))
            for name in ('presabs', 'copynum', 'depth'))
        genes_path = '%s/genes/output/%s.genes.gz' % (sample.dir, species_id)
        for row in utility.parse_file(genes_path):
            for old, new in legacy_fields:
                if old in row:
                    row[new] = row[old]
            family = gene_to_family[row['gene_id']]
            sample.genes['copynum'][family] += float(row['copy_number'])
            sample.genes['depth'][family] += float(row['coverage'])

    # threshold copy number into presence/absence
    for sample in samples:
        presabs = sample.genes['presabs']
        for family, copynum in sample.genes['copynum'].items():
            presabs[family] = 1 if copynum >= args['min_copy'] else 0
Exemplo n.º 7
0
def genes_summary(args):
    """Write per-species pangenome mapping statistics to genes/summary.txt.

    For every species listed in ``<outdir>/genes/temp/pangenome.map`` the
    matching ``<outdir>/genes/output/<species_id>.genes.gz`` file is read
    and summarised as:

    * pangenome_size   -- number of records in the genes file
    * covered_genes    -- records with coverage > 0
    * fraction_covered -- covered_genes / pangenome_size (0.0 if no records)
    * mean_coverage    -- mean coverage over covered genes (0.0 if none)
    * marker_coverage  -- coverage / copy_number of the last record whose
                          copy_number is > 0 (0 if there is none)

    Args:
        args: options dict; only ``args['outdir']`` is read.
    """
    fields = ['pangenome_size', 'covered_genes', 'fraction_covered',
              'mean_coverage', 'marker_coverage']

    # store stats
    stats = {}
    inpath = '%s/%s' % (args['outdir'], 'genes/temp/pangenome.map')
    for species_id in set(utility.read_ref_to_cluster(inpath).values()):
        pangenome_size, covered_genes, total_coverage, marker_coverage = 0, 0, 0, 0
        genes_path = '/'.join(
            [args['outdir'], 'genes/output/%s.genes.gz' % species_id])
        for r in utility.parse_file(genes_path):
            pangenome_size += 1
            coverage = float(r['coverage'])
            normcov = float(r['copy_number'])
            if coverage > 0:
                covered_genes += 1
                total_coverage += coverage
            if normcov > 0:
                marker_coverage = coverage / normcov
        stats[species_id] = {
            'pangenome_size': pangenome_size,
            'covered_genes': covered_genes,
            # an empty genes file must not abort the whole summary
            'fraction_covered': (covered_genes / float(pangenome_size)
                                 if pangenome_size > 0 else 0.0),
            'mean_coverage': (total_coverage / covered_genes
                              if covered_genes > 0 else 0.0),
            'marker_coverage': marker_coverage,
        }

    # write stats; the context manager closes the file even on error
    summary_path = '/'.join([args['outdir'], 'genes/summary.txt'])
    with open(summary_path, 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
Exemplo n.º 8
0
def snps_summary(args):
    """Write per-species SNP mapping statistics to snps/summary.txt.

    For every species listed in ``<outdir>/snps/temp/genomes.map`` the
    matching ``<outdir>/snps/output/<species_id>.snps.gz`` file is read
    and summarised as:

    * genome_length    -- number of records (sites) in the SNP file
    * covered_bases    -- sites with depth > 0
    * fraction_covered -- covered_bases / genome_length (0.0 if no sites)
    * mean_coverage    -- mean depth over covered sites (0 if none)

    Args:
        args: options dict; only ``args['outdir']`` is read.
    """
    fields = ['genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage']

    # store stats
    stats = {}
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        genome_length, covered_bases, total_depth = 0, 0, 0
        snps_path = '/'.join(
            [args['outdir'], 'snps/output/%s.snps.gz' % species_id])
        for r in utility.parse_file(snps_path):
            genome_length += 1
            depth = int(r['depth'])
            if depth > 0:
                covered_bases += 1
                total_depth += depth
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            # an empty SNP file must not abort the whole summary
            'fraction_covered': (covered_bases / float(genome_length)
                                 if genome_length > 0 else 0.0),
            'mean_coverage': (total_depth / float(covered_bases)
                              if covered_bases > 0 else 0),
        }

    # write stats; the context manager closes the file even on error
    summary_path = '/'.join([args['outdir'], 'snps/summary.txt'])
    with open(summary_path, 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
Exemplo n.º 9
0
def open_infiles(species_id, samples):
    """Return one SNP record parser per sample for the given species.

    The parsers are listed in the same order as ``samples``; each reads
    ``<sample.dir>/snps/output/<species_id>.snps.gz``.
    """
    return [
        utility.parse_file(
            '%s/snps/output/%s.snps.gz' % (sample.dir, species_id))
        for sample in samples
    ]
Exemplo n.º 10
0
def read_annotations(args):
    """Map species id to species name from ``<db>/species_info.txt``.

    Exits the program with a "File not found" message when the file is
    missing.
    """
    info_path = '%s/species_info.txt' % args['db']
    if not os.path.isfile(info_path):
        sys.exit("File not found: %s" % info_path)
    return dict(
        (row['species_id'], row['species_name'])
        for row in utility.parse_file(info_path))
Exemplo n.º 11
0
def read_annotations(args):
    """Map species id to species name from ``<db>/species_info.txt``.

    Exits the program with a "File not found" message when the file is
    missing.
    """
    info_path = '%s/species_info.txt' % args['db']
    if not os.path.isfile(info_path):
        sys.exit("File not found: %s" % info_path)
    return dict(
        (row['species_id'], row['species_name'])
        for row in utility.parse_file(info_path))
Exemplo n.º 12
0
def add_summary_stats(args, samples):
    """Attach depth and coverage statistics from the summary file to samples.

    Records of ``args['summary']`` whose sample id is a key of ``samples``
    set ``mean_depth`` and ``fraction_covered`` on that sample. Afterwards
    ``samples['between'].mean_depth`` becomes the mean of ``mean_depth``
    over every sample whose ``id`` is not 'between'.
    """
    for row in utility.parse_file(args['summary']):
        # older summary files name the sample column 'run_accession'
        if 'run_accession' in row:
            row['sample_id'] = row['run_accession']
        sample_id = row['sample_id']
        if sample_id not in samples:
            continue
        samples[sample_id].mean_depth = float(row['average_depth'])
        samples[sample_id].fraction_covered = float(row['fraction_covered'])
    depths = [s.mean_depth for s in samples.values() if s.id != 'between']
    samples['between'].mean_depth = np.mean(depths)
Exemplo n.º 13
0
def read_cluster_map(sp, db, pid):
    """Load the centroid_99 -> centroid_<pid> gene mapping for a species.

    Looks for ``<db>/pan_genomes/<sp.id>/gene_info.txt`` and its ``.gz``
    variant; when both exist the ``.gz`` file is used. The chosen path is
    stored on ``sp.gene_info`` and the mapping on ``sp.map``.

    Args:
        sp: species object; ``sp.id`` is read, ``sp.map`` and
            ``sp.gene_info`` are set.
        db: path of the database directory.
        pid: suffix of the ``centroid_<pid>`` column to map to.

    Raises:
        IOError: if neither gene_info file exists.
    """
    sp.map = {}
    base_path = '/'.join([db, 'pan_genomes', sp.id, 'gene_info.txt'])
    found = [path for path in (base_path, base_path + '.gz')
             if os.path.isfile(path)]
    if not found:
        # fail with a clear message instead of an AttributeError below
        raise IOError("File not found: %s(.gz)" % base_path)
    sp.gene_info = found[-1]  # prefer the .gz file when both exist
    for r in utility.parse_file(sp.gene_info):
        sp.map[r['centroid_99']] = r['centroid_%s' % pid]
Exemplo n.º 14
0
Arquivo: merge.py Projeto: palc/MIDAS
def read_stats(inpath, type):
    """Read a per-sample summary file into a dict keyed by species id.

    Old column names are copied onto their current names, and for
    ``type == 'genes'`` the 'fraction_covered' field is recomputed from
    'covered_genes' and 'pangenome_size'.
    """
    # (old field name, current field name)
    legacy_fields = (
        ('cluster_id', 'species_id'),
        ('phyeco_coverage', 'marker_coverage'),
        ('average_depth', 'mean_coverage'),
    )
    stats = {}
    for row in utility.parse_file(inpath):
        for old, new in legacy_fields:
            if old in row:
                row[new] = row[old]
        if type == 'genes':
            row['fraction_covered'] = (
                float(row['covered_genes']) / float(row['pangenome_size']))
        stats[row['species_id']] = row
    return stats
Exemplo n.º 15
0
def read_marker_info(args):
    """Collect the phyeco.map record of every marker gene in phyeco.fa.

    Returns a dict keyed by the sequence ids found in
    ``<db>/marker_genes/phyeco.fa``; a value is the matching record from
    ``<db>/marker_genes/phyeco.map``, or None when the map has no record
    for that id.
    """
    fasta_path = '%s/marker_genes/phyeco.fa' % args['db']
    info = dict.fromkeys(
        seq.id for seq in Bio.SeqIO.parse(fasta_path, 'fasta'))
    map_path = '%s/marker_genes/phyeco.map' % args['db']
    for row in utility.parse_file(map_path):
        gene_id = row['gene_id']
        if gene_id in info:
            info[gene_id] = row
    return info
Exemplo n.º 16
0
def read_stats(inpath, type):
    """Read a per-sample summary file into a dict keyed by species id.

    Old column names are copied onto their current names, and for
    ``type == 'genes'`` the 'fraction_covered' field is recomputed from
    'covered_genes' and 'pangenome_size'.
    """
    # (old field name, current field name)
    legacy_fields = (
        ('cluster_id', 'species_id'),
        ('phyeco_coverage', 'marker_coverage'),
        ('average_depth', 'mean_coverage'),
    )
    stats = {}
    for row in utility.parse_file(inpath):
        for old, new in legacy_fields:
            if old in row:
                row[new] = row[old]
        if type == 'genes':
            row['fraction_covered'] = (
                float(row['covered_genes']) / float(row['pangenome_size']))
        stats[row['species_id']] = row
    return stats
Exemplo n.º 17
0
def fetch_samples(args):
    """Build a Sample for every record of ``<indir>/snps_summary.txt``.

    Each Sample receives the depth/coverage filters and a ``stop`` flag.
    The flag turns True once the number of samples with ``pass_qc`` set
    has reached ``args['max_samples']``, so it is only seen by samples
    created after that point.
    """
    filters = {'min_depth': args['sample_depth'], 'min_cov': args['fract_cov']}
    samples, passed, stop = [], 0, False
    for info in utility.parse_file('%s/snps_summary.txt' % args['indir']):
        sample = Sample(info, filters, stop)
        samples.append(sample)
        if sample.pass_qc:
            passed += 1
        if passed >= args['max_samples']:
            stop = True
    return samples
Exemplo n.º 18
0
def read_genes(db, species_id, contigs):
    """Load the non-RNA genes of a species from its features file.

    Reads ``<db>/rep_genomes/<species_id>/genome.features.gz``, converts
    'start' and 'end' to int and adds a 'seq' field obtained from
    get_gene_seq using the contig named by the gene's 'scaffold_id'.
    """
    features_path = '%s/rep_genomes/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(features_path):
        if gene['gene_type'] == 'RNA':
            continue
        for coord in ('start', 'end'):
            gene[coord] = int(gene[coord])
        gene['seq'] = get_gene_seq(gene, contigs[gene['scaffold_id']])
        genes.append(gene)
    return genes
Exemplo n.º 19
0
def read_abundance(inpath):
    """Read a species abundance profile into a dict keyed by species id.

    Exits the program when ``inpath`` does not exist. Numeric columns that
    are present are converted: 'count_reads' to int, 'coverage' and
    'relative_abundance' to float.
    """
    if not os.path.isfile(inpath):
        sys.exit("\nCould not locate species profile: %s\nTry rerunning with run_midas.py species" % inpath)
    casts = (
        ('count_reads', int),
        ('coverage', float),
        ('relative_abundance', float),
    )
    abun = {}
    for row in utility.parse_file(inpath):
        # older profiles name the species column 'cluster_id'
        if 'cluster_id' in row:
            row['species_id'] = row['cluster_id']
        for field, cast in casts:
            if field in row:
                row[field] = cast(row[field])
        abun[row['species_id']] = row
    return abun
Exemplo n.º 20
0
def call_markers(species, samples, args):
    """Determine which marker alleles are present in each sample.

    Marker alleles are read one at a time from ``species.paths['markers']``
    (via ``fetch_marker``) while genomic sites are iterated from
    ``parse.fetch_sites(species, samples)``. For each site matched to the
    current marker, ``site.id`` is added to ``sample.markers`` of every
    sample whose marker allele frequency and marker read count reach
    ``args['min_freq']`` and ``args['min_reads']``.

    Args:
        species: object exposing ``paths['markers']``.
        samples: mapping whose values get a fresh ``markers`` set.
        args: options dict; reads 'max_sites', 'min_freq' and 'min_reads'.

    Returns:
        None; results are stored on the ``markers`` attribute of samples.

    Raises:
        SystemExit: if the marker file yields no marker allele.
    """

    # open marker list
    markers = utility.parse_file(species.paths['markers'])
    marker = fetch_marker(markers)  # dictionary for 1st marker allele
    if marker is None:
        sys.exit("\nError: no marker alleles found in file: %s\n" %
                 species.paths['markers'])

    # init markers per sample
    for sample in samples.values():
        sample.markers = set([])

    # loop over sites
    sites = parse.fetch_sites(species, samples)
    for index, site in enumerate(sites):

        # record progress (every 100000 sites, including index 0)
        if not index % 100000: print("%s sites processed" % index)

        # stop early
        if index >= args['max_sites']: break

        # skip sites not in marker list
        # NOTE(review): a site positioned after the current marker on the
        # same ref_id is not skipped; presumably markers and sites are
        # expected in the same sorted order -- confirm.
        if (site.ref_id != marker['ref_id']
                or site.ref_pos < marker['ref_pos']):
            continue

        # determine if marker present in each sample
        # NOTE(review): these samples come from site.samples, while the
        # markers sets were initialised on `samples` above; presumably they
        # are the same objects -- confirm.
        for sample in site.samples.values():

            # skip samples without marker: no depth at this site, or the
            # marker allele is neither the site's reference allele nor the
            # sample's alternate allele
            if sample.depth == 0:
                continue
            elif marker['allele'] == site.ref_allele:
                sample.marker_freq = sample.ref_freq
            elif marker['allele'] == sample.alt_allele:
                sample.marker_freq = 1 - sample.ref_freq
            else:
                continue

            # record marker allele for sample_id
            # marker_count is the marker frequency times depth, rounded
            sample.marker_count = round(sample.marker_freq * sample.depth)
            if (sample.marker_freq >= args['min_freq']
                    and sample.marker_count >= args['min_reads']):
                sample.markers.add(site.id)

        # fetch next marker allele (only reached for sites that were not
        # skipped above); stop once the marker list is exhausted
        marker = fetch_marker(markers)
        if marker is None: break
Exemplo n.º 21
0
def read_genes(db, species_id, contigs):
    """Load the non-rna genes of a species from its features file.

    Reads ``<db>/genome_clusters/<species_id>/genome.features.gz``,
    prefixes each accession with 'accn|', converts 'start' and 'end' to
    int and adds a 'seq' field obtained from get_gene_seq using the contig
    stored under the prefixed accession.
    """
    features_path = '%s/genome_clusters/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(features_path):
        if gene['gene_type'] == 'rna':
            continue
        gene['accession'] = 'accn|%s' % gene['accession']
        for coord in ('start', 'end'):
            gene[coord] = int(gene[coord])
        gene['seq'] = get_gene_seq(gene, contigs[gene['accession']])
        genes.append(gene)
    return genes
Exemplo n.º 22
0
def read_genes(db, species_id, contigs):
    """Load the non-rna genes of a species from its features file.

    Reads ``<db>/genome_clusters/<species_id>/genome.features.gz``,
    prefixes each accession with 'accn|', converts 'start' and 'end' to
    int and adds a 'seq' field obtained from get_gene_seq using the contig
    stored under the prefixed accession.
    """
    features_path = '%s/genome_clusters/%s/genome.features.gz' % (db, species_id)
    genes = []
    for gene in utility.parse_file(features_path):
        if gene['gene_type'] == 'rna':
            continue
        gene['accession'] = 'accn|%s' % gene['accession']
        for coord in ('start', 'end'):
            gene[coord] = int(gene[coord])
        gene['seq'] = get_gene_seq(gene, contigs[gene['accession']])
        genes.append(gene)
    return genes
Exemplo n.º 23
0
def select_species(args, type='genes'):
    """Pick the species that have enough high-coverage samples.

    Samples are attached to a species when they pass the coverage filters
    in ``args``. Species with fewer than ``args['min_samples']`` samples
    are dropped, the rest are ordered by sort_species and truncated to
    ``args['max_species']``. An output directory is created per species.

    Returns the list of selected Species objects.
    """
    # species annotations, keyed by species id
    annotations_path = os.path.join(args['db'], 'species_info.txt')
    species_info = {}
    for row in utility.parse_file(annotations_path):
        species_info[row['species_id']] = row

    # attach every qualifying sample to its species
    candidates = {}
    for sample in load_samples(args):
        if not sample.paths[type]:
            sys.stderr.write("Warning: no %s output for sample: %s\n" %
                             (type, sample.dir))
            continue
        for species_id, info in read_stats(sample.paths[type], type).items():
            # species was not requested
            if (args['species_id']
                    and species_id not in args['species_id'].split(',')):
                continue
            # species already has its maximum number of samples
            if (args['max_samples'] and species_id in candidates
                    and len(candidates[species_id].samples) >= args['max_samples']):
                continue
            # sample depth too low
            if float(info['mean_coverage']) < args['sample_depth']:
                continue
            # sample breadth too low (snps only)
            if (type == 'snps'
                    and float(info['fraction_covered']) < args['fract_cov']):
                continue
            if species_id not in candidates:
                candidates[species_id] = Species(species_id, species_info)
            candidates[species_id].samples.append(sample)

    # drop species with too few samples, then order and truncate
    selected = [
        sp for sp in list(candidates.values())
        if len(sp.samples) >= int(args['min_samples'])
    ]
    selected = sort_species(selected)
    if args['max_species'] is not None and len(selected) > args['max_species']:
        selected = selected[0:args['max_species']]

    # one output directory per selected species
    for sp in selected:
        species_dir = os.path.join(args['outdir'], sp.id)
        if not os.path.isdir(species_dir):
            os.mkdir(species_dir)
    print("  found %s species with sufficient high-coverage samples\n" %
          len(selected))
    return selected
Exemplo n.º 24
0
def parse_sites(indir, samples):
    """Yield GenomicSite objects built from the SNP matrices in ``indir``.

    Opens ``snps_alt_allele.txt``, ``snps_depth.txt`` and
    ``snps_ref_freq.txt`` with parse_tsv plus ``snps_info.txt`` with
    utility.parse_file, then yields sites until a site with a false
    ``id`` is produced.

    The parse_tsv handles are closed when the generator finishes, is
    closed early by the consumer, or raises.

    Args:
        indir: directory holding the ``snps_*.txt`` files.
        samples: passed through to GenomicSite.
    """
    files = {}
    try:
        # open input files
        for ext in ['alt_allele', 'depth', 'ref_freq']:
            files[ext] = parse_tsv('%s/snps_%s.txt' % (indir, ext))
        info = utility.parse_file('%s/snps_info.txt' % indir)
        # yield GenomicSite until the inputs are exhausted
        while True:
            site = GenomicSite(files, samples, info)
            if not site.id:
                break
            yield site
    finally:
        # close input files, including on early exit or error
        for handle in files.values():
            handle.close()
Exemplo n.º 25
0
def read_abundance(inpath):
    """Read a species abundance profile into a dict keyed by species id.

    Numeric columns that are present are converted: 'count_reads' to int,
    'coverage' and 'relative_abundance' to float. Profiles that only carry
    the older 'cluster_id' column get it copied to 'species_id'.

    Args:
        inpath: path of the species profile.

    Returns:
        dict mapping species id to its record.

    Raises:
        SystemExit: if ``inpath`` does not exist.
    """
    if not os.path.isfile(inpath):
        sys.exit(
            "\nCould not locate species profile: %s\nTry rerunning with run_midas.py species"
            % inpath)
    abun = {}
    for rec in utility.parse_file(inpath):
        # format record; accept the legacy column name without overriding
        # a 'species_id' that is already present
        if 'species_id' not in rec and 'cluster_id' in rec:
            rec['species_id'] = rec['cluster_id']
        if 'count_reads' in rec:
            rec['count_reads'] = int(rec['count_reads'])
        if 'coverage' in rec:
            rec['coverage'] = float(rec['coverage'])
        if 'relative_abundance' in rec:
            rec['relative_abundance'] = float(rec['relative_abundance'])
        abun[rec['species_id']] = rec
    return abun
Exemplo n.º 26
0
def parse_sites(indir, site_depth=0, info_path=None, max_sites=None):
    """Yield GenomicSite objects from the SNP matrices in ``indir``.

    Opens ``snps_ref_freq.txt``, ``snps_depth.txt`` and
    ``snps_alt_allele.txt``, skips the first line of each, and yields
    sites until a site with a false ``id`` is produced or ``max_sites``
    sites have been yielded.

    All three files are closed when the generator finishes, is closed
    early by the consumer, or raises.

    Args:
        indir: directory holding the ``snps_*.txt`` matrices.
        site_depth: passed through to GenomicSite.
        info_path: optional SNP info file, parsed with utility.parse_file
            and passed to GenomicSite (None when not given).
        max_sites: optional cap on the number of yielded sites.
    """
    index = 0
    sample_ids = list_sample_ids('%s/snps_ref_freq.txt' % indir)
    handles = []
    try:
        for name in ('ref_freq', 'depth', 'alt_allele'):
            handle = open('%s/snps_%s.txt' % (indir, name))
            handles.append(handle)
            next(handle)  # skip the first line of the matrix
        freq, depth, alleles = handles
        info = utility.parse_file(info_path) if info_path else None
        while True:
            site = GenomicSite(sample_ids, freq, depth, alleles, site_depth, info)
            if not site.id:
                break
            elif max_sites and index >= max_sites:
                break
            else:
                index += 1
                yield site
    finally:
        # close every opened matrix, including the alleles file
        for handle in handles:
            handle.close()
0
def select_species(args, type='genes'):
    """Select all species with a minimum number of high-coverage samples.

    Args:
        args: options dict; reads 'db', 'species_id', 'max_samples',
            'sample_depth', 'fract_cov', 'min_samples', 'max_species'
            and 'outdir'.
        type: key into ``sample.paths`` and mode passed to read_stats;
            when it is 'snps' the 'fract_cov' filter is applied as well.

    Returns:
        List of Species objects as ordered by sort_species, truncated to
        ``args['max_species']`` when that is set. An output directory is
        created under ``args['outdir']`` for each of them.
    """
    # read species annotations
    species_info = {}
    inpath = os.path.join(args['db'], 'species_info.txt')
    for rec in utility.parse_file(inpath):
        species_info[rec['species_id']] = rec
    # fetch all species with at least 1 sample
    species = {}
    for sample in load_samples(args):
        if not sample.paths[type]:
            # name the output type that is actually missing
            sys.stderr.write("Warning: no %s output for sample: %s\n" % (type, sample.dir))
            continue
        for species_id, info in read_stats(sample.paths[type], type).items():
            if (args['species_id']
                    and species_id not in args['species_id'].split(',')):
                continue  # skip unspecified species
            elif (args['max_samples']
                    and species_id in species
                    and len(species[species_id].samples) >= args['max_samples']):
                continue  # skip species with too many samples
            elif float(info['mean_coverage']) < args['sample_depth']:
                continue  # skip low-coverage sample
            elif type == 'snps' and float(info['fraction_covered']) < args['fract_cov']:
                continue  # skip low-coverage sample
            if species_id not in species:
                species[species_id] = Species(species_id, species_info)  # initialize new species
            species[species_id].samples.append(sample)  # append sample
    # dict to list
    species = list(species.values())
    # remove species with an insufficient number of samples
    species = [sp for sp in species if len(sp.samples) >= args['min_samples']]
    # sort by number of samples
    species = sort_species(species)
    # select a subset of species to analyze
    if args['max_species'] is not None and len(species) > args['max_species']:
        species = species[0:args['max_species']]
    # create species directories
    for sp in species:
        outdir = os.path.join(args['outdir'], sp.id)
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
    # parenthesised single argument: valid on both Python 2 and Python 3
    print("  found %s species with sufficient high-coverage samples\n" % len(species))
    return species
Exemplo n.º 28
0
Arquivo: snps.py Projeto: palc/MIDAS
def snps_summary(args):
    """Write per-species SNP mapping statistics to snps/summary.txt.

    For every species listed in ``<outdir>/snps/temp/genomes.map`` the
    matching ``<outdir>/snps/output/<species_id>.snps.gz`` file is read
    and summarised as:

    * genome_length    -- number of records (sites) in the SNP file
    * covered_bases    -- sites with depth > 0
    * fraction_covered -- covered_bases / genome_length (0.0 if no sites)
    * mean_coverage    -- mean depth over covered sites (0 if none)

    Args:
        args: options dict; only ``args['outdir']`` is read.
    """
    fields = [
        'genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage'
    ]

    # store stats
    stats = {}
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        genome_length, covered_bases, total_depth = 0, 0, 0
        snps_path = '/'.join(
            [args['outdir'], 'snps/output/%s.snps.gz' % species_id])
        for r in utility.parse_file(snps_path):
            genome_length += 1
            depth = int(r['depth'])
            if depth > 0:
                covered_bases += 1
                total_depth += depth
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            # an empty SNP file must not abort the whole summary
            'fraction_covered': (covered_bases / float(genome_length)
                                 if genome_length > 0 else 0.0),
            'mean_coverage': (total_depth / float(covered_bases)
                              if covered_bases > 0 else 0),
        }

    # write stats; the context manager closes the file even on error
    summary_path = '/'.join([args['outdir'], 'snps/summary.txt'])
    with open(summary_path, 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
Exemplo n.º 29
0
def fetch_samples(args):
    """List the samples in ``<indir>/snps_summary.txt`` and flag QC status.

    Every record becomes a Sample whose ``pass_qc`` flag is False when any
    of the following holds:

    * ``sample.filter(sample_depth, fract_cov)`` returns a true value
    * ``args['keep_samples']`` is set and does not contain the sample id
    * ``args['exclude_samples']`` is set and contains the sample id
    * ``args['max_samples']`` passing samples were already collected

    When ``args['rand_samples']`` is set, resample_samples is applied to
    the list before it is returned.

    Returns:
        List of all Sample objects, passing or not, in file order.
    """
    from midas.analyze.diversity import Sample
    samples = []
    count_pass = 0  # running count; avoids rescanning the list per record
    for info in parse_file('%s/snps_summary.txt' % args['indir']):
        # init sample
        sample = Sample(info)
        sample.pass_qc = True
        # keep/exclude sample
        if sample.filter(args['sample_depth'], args['fract_cov']):
            sample.pass_qc = False
        if args['keep_samples'] and sample.id not in args['keep_samples']:
            sample.pass_qc = False
        if args['exclude_samples'] and sample.id in args['exclude_samples']:
            sample.pass_qc = False
        if count_pass >= args['max_samples']:
            sample.pass_qc = False
        # store sample
        samples.append(sample)
        if sample.pass_qc:
            count_pass += 1
    # select random sample
    if args['rand_samples']:
        resample_samples(samples)
    return samples
Exemplo n.º 30
0
def build_gene_matrices(sp, min_copy):
    """Fill ``sample.genes`` with per-family matrices for each sample of sp.

    Each sample gets four defaultdicts keyed by gene family: 'copynum',
    'depth' and 'reads' are summed over the genes of the family, and
    'presabs' is 1 when the summed copy number reaches ``min_copy``,
    else 0.
    """
    matrix_types = (('presabs', float), ('copynum', float),
                    ('depth', float), ('reads', int))
    # (old field name, current field name) pairs found in older output files
    legacy_fields = (
        ('ref_id', 'gene_id'),
        ('normalized_coverage', 'copy_number'),
        ('raw_coverage', 'coverage'),
    )

    # accumulate copy number, depth and read counts per gene family
    for sample in sp.samples:
        sample.genes = dict(
            (name, defaultdict(kind)) for name, kind in matrix_types)
        genes_path = '%s/genes/output/%s.genes.gz' % (sample.dir, sp.id)
        for row in utility.parse_file(genes_path):
            for old, new in legacy_fields:
                if old in row:
                    row[new] = row[old]
            family = sp.map[row['gene_id']]
            sample.genes['copynum'][family] += float(row['copy_number'])
            sample.genes['depth'][family] += float(row['coverage'])
            reads = int(row['count_reads']) if 'count_reads' in row else 0
            sample.genes['reads'][family] += reads

    # threshold copy number into presence/absence
    for sample in sp.samples:
        presabs = sample.genes['presabs']
        for family, copynum in sample.genes['copynum'].items():
            presabs[family] = 1 if copynum >= min_copy else 0
Exemplo n.º 31
0
def build_gene_matrices(species_id, samples, args):
    """Fill ``sample.genes`` with per-family matrices for every sample.

    Each sample gets three defaultdicts keyed by gene family: 'copynum'
    and 'depth' are summed over the genes of the family, and 'presabs' is
    1 when the summed copy number reaches ``args['min_copy']``, else 0.
    """
    gene_to_family = read_gene_map(species_id, args)
    n_genes = len(gene_to_family.keys())
    genome_ids = set(['.'.join(gene.split('.')[0:2]) for gene in gene_to_family])
    n_families = len(set(gene_to_family.values()))
    print("    %s genes from %s genomes" % (n_genes, len(genome_ids)))
    print("    clustered into %s families at %s percent id" % (n_families, args['cluster_pid']))

    # (old field name, current field name) pairs found in older output files
    legacy_fields = (
        ('normalized_coverage', 'copy_number'),
        ('raw_coverage', 'coverage'),
    )

    # accumulate copy number and depth per gene family
    for sample in samples:
        sample.genes = dict(
            (name, defaultdict(float))
            for name in ('presabs', 'copynum', 'depth'))
        genes_path = '%s/genes/output/%s.genes.gz' % (sample.dir, species_id)
        for row in utility.parse_file(genes_path):
            for old, new in legacy_fields:
                if old in row:
                    row[new] = row[old]
            family = gene_to_family[row['gene_id']]
            sample.genes['copynum'][family] += float(row['copy_number'])
            sample.genes['depth'][family] += float(row['coverage'])

    # threshold copy number into presence/absence
    for sample in samples:
        presabs = sample.genes['presabs']
        for family, copynum in sample.genes['copynum'].items():
            presabs[family] = 1 if copynum >= args['min_copy'] else 0
Exemplo n.º 32
0
def snps_summary(args, species):
    """Write per-species SNP mapping statistics to snps/summary.txt.

    For every species id in ``species`` the file
    ``<outdir>/snps/output/<species_id>.snps.gz`` is read and summarised
    as:

    * genome_length    -- number of records (sites) in the SNP file
    * covered_bases    -- sites with depth > 0
    * fraction_covered -- covered_bases / genome_length (0.0 if no sites)
    * mean_coverage    -- mean depth over covered sites (0 if none)

    Args:
        args: options dict; only ``args['outdir']`` is read.
        species: iterable of species ids to summarise.
    """
    fields = [
        'genome_length', 'covered_bases', 'fraction_covered', 'mean_coverage'
    ]

    # store stats
    stats = {}
    for species_id in species:
        genome_length, covered_bases, total_depth = 0, 0, 0
        snps_path = '/'.join(
            [args['outdir'], 'snps/output/%s.snps.gz' % species_id])
        for r in utility.parse_file(snps_path):
            genome_length += 1
            depth = int(r['depth'])
            if depth > 0:
                covered_bases += 1
                total_depth += depth
        # an empty SNP file must not abort the whole summary
        fraction_covered = (covered_bases / float(genome_length)
                            if genome_length > 0 else 0.0)
        mean_coverage = (total_depth / float(covered_bases)
                         if covered_bases > 0 else 0)
        stats[species_id] = {
            'genome_length': genome_length,
            'covered_bases': covered_bases,
            'fraction_covered': fraction_covered,
            'mean_coverage': mean_coverage,
        }

    # write stats; the context manager closes the file even on error
    summary_path = '/'.join([args['outdir'], 'snps/summary.txt'])
    with open(summary_path, 'w') as outfile:
        outfile.write('\t'.join(['species_id'] + fields) + '\n')
        for species_id in stats:
            record = [species_id] + [str(stats[species_id][field]) for field in fields]
            outfile.write('\t'.join(record) + '\n')
Exemplo n.º 33
0
def list_genes(args):
    """Return the set of non-empty gene ids in ``<indir>/snps_info.txt``."""
    info_path = '%s/snps_info.txt' % args['indir']
    return set(
        row['gene_id'] for row in parse_file(info_path)
        if row['gene_id'] != '')
Exemplo n.º 34
0
def read_annotations(args):
    """Return the records of ``<db>/species_info.txt`` keyed by species id."""
    info_path = '%s/species_info.txt' % args['db']
    return dict(
        (row['species_id'], row) for row in utility.parse_file(info_path))