示例#1
0
    opt_codons = cai.getOptimalCodons(options.species)
    relad_dict = cai.getRelativeAdaptivenessValues(options.species)
    cai_fxn = cai.getCAIFunction(options.species)
    # DAD: cai.getCAI takes only log-transformed relative adaptiveness values.
    # Here, knowing that some values are == 0, add half of the minimum nonzero value.
    # Should be using some sort of better estimator!
    min_relad_value = 0.5 * min([v for v in relad_dict.values() if v > 0.0])
    for k in relad_dict.keys():
        if relad_dict[k] <= 0.0:
            relad_dict[k] = min_relad_value
    ln_relad_dict = dict([(k, math.log(v)) for (k, v) in relad_dict.items()])

    # Assay the provided sequences
    for (id, seq) in seqs:
        line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format(
            id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq))
        info_outs.write(line)

    # If optimization is desired, do it.
    if options.optimize:
        info_outs.write("# Optimizing sequences...\n")
        gc = translate.geneticCode(rna=False)
        codons = {}
        opt_codon_dict = dict([(gc[c], c) for c in opt_codons])
        opt_codon_dict['W'] = 'TGG'
        opt_codon_dict['M'] = 'ATG'

        opt_headers = []
        opt_seqs = []
        # optimize the codon sequences
        for (id, seq) in seqs:
示例#2
0
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn,
               outfile):
    """Write per-gene intron/coding statistics as tab-delimited rows.

    For every chromosome, the chromosome sequence is loaded and each
    peptide whose first record (in mapping order) lies on that chromosome
    gets one row in ``outfile``.  The header plus progress and summary
    lines are echoed to stdout.

    Args:
        chromosomes: Iterable of chromosome identifiers; each one is passed
            to ``chromosome_load_fxn`` and compared against the records'
            ``chromosome`` attribute.
        peptide_records: Mapping whose values are themselves mappings of
            exon records.  The records must provide ``chromosome``,
            ``exon_start``, ``strand``, ``coding_start`` and ``coding_end``.
        bounds: Sequence of boundary values handed one at a time to
            ``codingOutsideBoundary``; one ``frac.<bound>`` column is
            written per value.  It is iterated once for the header and once
            per gene, so it must be re-iterable (not a one-shot generator).
        chromosome_load_fxn: Callable returning the sequence data for a
            chromosome identifier.
        outfile: Writable file-like object.  It is flushed after every
            chromosome and CLOSED before this function returns.
    """
    # Header without its newline, so the same text can be written to the
    # file and echoed via print() without doubling the line break.
    header_line = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s" % \
        ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header_line + "\n")
    print(header_line)
    total_genes = 0
    total_errors = 0
    # Reconstruct genes
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # None of the error counters below is incremented anywhere in this
        # function; they are kept so the summary lines retain their format.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0
        for recdict in peptide_records.values():
            # list() keeps the indexing and in-place sort below working
            # when values() returns a view rather than a list.
            recs = list(recdict.values())
            if not recs:
                # No exon records: nothing to assign to any chromosome.
                continue
            if recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq, _) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequence if it's on the negative strand
            # (strand is compared against the string '-1', not an integer).
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            # Intron-derived statistics; 'NA' when the gene has no intron.
            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (
                    len(seq) / (len(seq) + float(intron_length)), )
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(
                    intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                gc_intron = 'NA'
                gcind_intron = 'NA'
                # Previously left unset here, which raised
                # UnboundLocalError for the first intronless gene or reused
                # the preceding gene's value for later ones.
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'

            num_coding_exons = sum(
                1 for xr in recs if xr.coding_end > xr.coding_start)

            # Fraction of coding sequence inside each boundary.
            fracs_inside = []
            for bound in bounds:
                (coding_outside,
                 total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    frac_inside_bound = "%1.4f" % (
                        1 - float(coding_outside) / total_coding, )
                else:
                    frac_inside_bound = "NA"
                fracs_inside.append(frac_inside_bound)

            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID,
                 sentinel_rec.transcript_ID, sentinel_rec.chromosome,
                 intron_length, num_coding_exons, frac_coding, gc_intron,
                 gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)

        outfile.flush()
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("#     %d length errors" % (n_wrong_length, ))
        print("#     %d bad translation errors" % (n_bad_translation, ))
        print("#     %d different translation errors" % (
            n_different_translation, ))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference so the sequence can be reclaimed before the
        # next chromosome is loaded.
        chrseq = None
    print("# Processed %d genes with %d errors total" % (total_genes,
                                                         total_errors))
    outfile.close()
示例#3
0
文件: exonstats.py 项目: dad/base
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
	"""Write per-gene intron/coding statistics as tab-delimited rows.

	For every chromosome, the chromosome sequence is loaded and each
	peptide whose first record (in mapping order) lies on that chromosome
	gets one row in ``outfile``.  The header plus progress and summary
	lines are echoed to stdout.

	Args:
		chromosomes: Iterable of chromosome identifiers; each one is passed
			to ``chromosome_load_fxn`` and compared against the records'
			``chromosome`` attribute.
		peptide_records: Mapping whose values are themselves mappings of
			exon records.  The records must provide ``chromosome``,
			``exon_start``, ``strand``, ``coding_start`` and ``coding_end``.
		bounds: Sequence of boundary values handed one at a time to
			``codingOutsideBoundary``; one ``frac.<bound>`` column is
			written per value.  It is iterated once for the header and once
			per gene, so it must be re-iterable (not a one-shot generator).
		chromosome_load_fxn: Callable returning the sequence data for a
			chromosome identifier.
		outfile: Writable file-like object.  It is flushed after every
			chromosome and CLOSED before this function returns.
	"""
	# Header without its newline, so the same text can be written to the
	# file and echoed via print() without doubling the line break.
	header_line = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s" % \
		('\tfrac.'.join([str(b) for b in bounds]),)
	outfile.write(header_line + "\n")
	print(header_line)
	total_genes = 0
	total_errors = 0
	# Reconstruct genes
	for chrom in chromosomes:
		print("# Chromosome %s" % (chrom,))
		# Load this chromosome's data
		chrseq = chromosome_load_fxn(chrom)
		n_genes = 0
		# None of the error counters below is incremented anywhere in this
		# function; they are kept so the summary lines retain their format.
		n_errors = 0
		n_wrong_length = 0
		n_bad_translation = 0
		n_different_translation = 0
		for recdict in peptide_records.values():
			# list() keeps the indexing and in-place sort below working
			# when values() returns a view rather than a list.
			recs = list(recdict.values())
			if not recs:
				# No exon records: nothing to assign to any chromosome.
				continue
			if recs[0].chromosome != chrom:
				continue
			recs.sort(key=lambda e: e.exon_start)
			(seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
			(intron_seq, _) = buildIntronSequence(recs, chrseq)
			# Reverse-complement the sequence if it's on the negative strand
			# (strand is compared against the string '-1', not an integer).
			if recs[0].strand == '-1':
				seq = translate.reverseComplement(seq)
				intron_seq = translate.reverseComplement(intron_seq)
			n_genes += 1

			# Intron-derived statistics; 'NA' when the gene has no intron.
			intron_length = len(intron_seq)
			if intron_length > 0:
				gc_intron = '%1.4f' % cai.getGC(intron_seq)
				frac_coding = '%1.4f' % (len(seq)/(len(seq)+float(intron_length)),)
				gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
				gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
			else:
				gc_intron = 'NA'
				gcind_intron = 'NA'
				# Previously left unset here, which raised
				# UnboundLocalError for the first intronless gene or reused
				# the preceding gene's value for later ones.
				gt_intron = 'NA'
				frac_coding = '1.0' if len(seq) > 0 else 'NA'

			num_coding_exons = sum(1 for xr in recs if xr.coding_end > xr.coding_start)

			# Fraction of coding sequence inside each boundary.
			fracs_inside = []
			for bound in bounds:
				(coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
				if total_coding > 0:
					frac_inside_bound = "%1.4f" % (1-float(coding_outside)/total_coding,)
				else:
					frac_inside_bound = "NA"
				fracs_inside.append(frac_inside_bound)

			line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
				(sentinel_rec.gene_ID, sentinel_rec.peptide_ID,
				sentinel_rec.transcript_ID, sentinel_rec.chromosome,
				intron_length, num_coding_exons, frac_coding, gc_intron,
				gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
			outfile.write(line)

		outfile.flush()
		print("# Processed %d genes with %d errors" % (n_genes, n_errors))
		print("#     %d length errors" % (n_wrong_length,))
		print("#     %d bad translation errors" % (n_bad_translation,))
		print("#     %d different translation errors" % (n_different_translation,))
		total_errors += n_errors
		total_genes += n_genes
		# Drop the reference so the sequence can be reclaimed before the
		# next chromosome is loaded.
		chrseq = None
	print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
	outfile.close()
示例#4
0
文件: codonopt.py 项目: dad/base
	info_outs.write("# Using optimal codons for {}\n".format(options.species))
	opt_codons = cai.getOptimalCodons(options.species)
	relad_dict = cai.getRelativeAdaptivenessValues(options.species)
	cai_fxn = cai.getCAIFunction(options.species)
	# DAD: cai.getCAI takes only log-transformed relative adaptiveness values.
	# Here, knowing that some values are == 0, add half of the minimum nonzero value.
	# Should be using some sort of better estimator!
	min_relad_value = 0.5 * min([v for v in relad_dict.values() if v>0.0])
	for k in relad_dict.keys():
		if relad_dict[k] <= 0.0:
			relad_dict[k] = min_relad_value
	ln_relad_dict = dict([(k,math.log(v)) for (k,v) in relad_dict.items()])

	# Assay the provided sequences
	for (id, seq) in seqs:
		line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format(id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq))
		info_outs.write(line)

	# If optimization is desired, do it.
	if options.optimize:
		info_outs.write("# Optimizing sequences...\n")
		gc = translate.geneticCode(rna=False)
		codons = {}
		opt_codon_dict = dict([(gc[c],c) for c in opt_codons])
		opt_codon_dict['W'] = 'TGG'
		opt_codon_dict['M'] = 'ATG'

		opt_headers = []
		opt_seqs = []
		# optimize the codon sequences
		for (id, seq) in seqs: