def get_response_content(fs):
    """
    Describe the zygosity distribution for three region types.
    The heterozygous, homozygous and misaligned homozygous sections differ
    only in how the fs.w, fs.x, fs.y and fs.z values are split between
    the two arguments passed to DGRP.get_zygosity_distribution.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    out = StringIO()

    def write_region(title, a, b):
        # one titled section: the heading line, then the formatted distribution
        out.write(title + '\n')
        distn = DGRP.get_zygosity_distribution(a, b)
        out.write('%s\n' % (distn_to_string(distn),))

    write_region('heterozygous region:', fs.x, fs.y + fs.z)
    # sections are separated by a blank line
    out.write('\n')
    write_region('homozygous region:', fs.x + fs.y, fs.z)
    out.write('\n')
    write_region('misaligned homozygous region:', fs.w + fs.x + fs.y, fs.z)
    return out.getvalue()
def gen_output_lines(args, fin):
    """
    Yield observation lines, given a filtered pileup file open for reading.
    Each yielded line is the tab-joined string form of one value emitted
    by the filler generator; blank input lines are skipped.
    @param args: provides reqambig, fill, errlow, errhigh, low and high;
    low and high may each be the string 'drosophila'
    @param fin: an iterable of filtered pileup lines
    @raise Exception: if the input has no non-blank lines, if the chromosome
    name changes within the input, if high is 'drosophila' and the chromosome
    name is not a known fly chromosome, or if reqambig is set and an
    out-of-bounds position has a reference nucleotide other than N
    """
    # unpack some relevant arguments
    reqambig = args.reqambig
    fill = args.fill
    errlow, errhigh = args.errlow, args.errhigh
    low = 1 if args.low == 'drosophila' else args.low
    high = args.high
    # the default observation passed to the filler generator
    default_obs = (0, 0, 0, 0)
    # state maintained across input lines;
    # both are set when the first non-blank line is seen
    filler = None
    chrom_name = None
    for line in fin:
        srow = line.split()
        if not srow:
            continue
        row = DGRP.filtered_pileup_row_to_typed(srow)
        obs = DGRP.filtered_pileup_typed_to_obs(row)
        name, pos, ref = row[:3]
        if filler is None:
            # the first row fixes the chromosome name
            chrom_name = name
            if args.high == 'drosophila':
                # the upper bound is the length recorded for this chromosome
                name_to_length = dict(DGRP.g_chromosome_length_pairs)
                high = name_to_length.get(name)
                if high is None:
                    raise Exception('invalid fly chromosome: ' + name)
            filler = iterfiller.FillerGenerator(
                    low, high, fill, errlow, errhigh, default_obs)
        # check the chromosome name for consistency
        if name != chrom_name:
            raise Exception(
                    'conflicting chromosome '
                    'names: %s %s' % (name, chrom_name))
        # check for reference nucleotide weirdness
        if reqambig and not filler.check_bounds(pos) and ref != 'N':
            raise Exception(
                    'expected out of bounds reference nucleotides '
                    'to be N but found %s '
                    'at position %d of chrom %s' % (ref, pos, name))
        # process lines emitted by the filler
        for value in filler.fill(pos, obs):
            yield '\t'.join(str(x) for x in value)
    if filler is None:
        # Without this check an empty input would fail below with an
        # AttributeError on None, which hides the real problem.
        raise Exception('no observation lines were found in the input')
    # process final lines emitted by the filler
    for value in filler.finish():
        yield '\t'.join(str(x) for x in value)
def main(args):
    """
    Split a filtered pileup file into one converted file per chromosome.
    The input file is read three times: once to collect chromosome names
    and the line count, once to check types and monotonicity, and once to
    write the converted rows.
    @param args: positional and flaglike arguments; reads infile, outdir,
    force, out_prefix, out_suffix and noheader
    @raise Exception: if the output directory does not exist (and force is
    not set), or if an output file already exists (and force is not set)
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    # make sure that the output directory exists
    if not os.path.isdir(output_directory):
        if force:
            os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # scan the input file for chromosome names
    ch_paths = []
    skimmer = DGRP.ChromoSkimmer()
    with open(input_filename) as fin:
        for chromo_name in skimmer.skim(gen_untyped_rows(fin)):
            output_filename = args.out_prefix + chromo_name + args.out_suffix
            ch_path = os.path.join(output_directory, output_filename)
            ch_paths.append(ch_path)
            # refuse to overwrite existing output unless forced
            if not force:
                if os.path.exists(ch_path):
                    raise Exception('output already exists: ' + ch_path)
    chromo_names = skimmer.name_list
    nlines = skimmer.linecount
    # start the progress bar; each line is counted once per remaining pass
    nticks = 2 * nlines
    pbar = Progress.Bar(nticks)
    # scan the input file for correct types and for monotonicity
    with open(input_filename) as fin:
        for _ in DGRP.check_chromo_monotonicity(gen_typed_rows(fin)):
            pbar.increment()
    # Every file that was opened is closed in the finally clause,
    # including when a later open or a write fails part way through.
    ch_files = []
    try:
        # create the files open for writing
        for p in ch_paths:
            ch_files.append(open(p, 'wt'))
        # write the headers
        if not args.noheader:
            for f in ch_files:
                f.write(g_header + '\n')
        # write the lines
        # NOTE(review): pairing by position assumes skimmer.name_list is in
        # the same order as the names yielded by skim() above -- confirm.
        name_to_file = dict(zip(chromo_names, ch_files))
        with open(input_filename) as fin:
            for row in gen_typed_rows(fin):
                name = row[0]
                row_out = convert_row(row)
                f = name_to_file[name]
                line_out = '\t'.join(str(x) for x in row_out)
                f.write(line_out + '\n')
                pbar.increment()
    finally:
        # close the files
        for f in ch_files:
            f.close()
def main(args):
    """
    Print one tab-separated annotation line per observed position.
    Each line holds the observation entries, the four posterior state
    probabilities, and the value returned by get_maxpost for them.
    @param args: reads out_forward, out_scaling, out_backward,
    misalignment_effect, region_size and obsfile; it is also passed whole
    to DGRP.Model.from_fieldstorage
    """
    filenames = (args.out_forward, args.out_scaling, args.out_backward)
    # aggregate and validate the model parameters
    model = DGRP.Model()
    model.from_fieldstorage(args)
    # see how the states interact with the observations
    states = (
            model.get_recent_state(),
            model.get_ancient_state(),
            model.get_misaligned_state(args.misalignment_effect),
            model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / args.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # make the hmm
    hmm = ExternalHMM.ExternalModel(T, states, filenames)
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialDiskIO(converter, args.obsfile)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    # the stream is closed even if the posterior decoding raises
    try:
        posterior_obs_pairs = itertools.izip(
                hmm.posterior(), o_stream.read_forward())
        for p, obs in posterior_obs_pairs:
            # unpacking also checks that there are exactly four posteriors
            p_recent, p_ancient, p_misaligned, p_garbage = p
            maxpost = get_maxpost(
                    p_recent, p_ancient, p_misaligned, p_garbage)
            # show the annotation for this position
            annotation = list(obs) + list(p) + [maxpost]
            print('\t'.join(str(x) for x in annotation))
    finally:
        o_stream.close()
def get_response_content(fs):
    """
    Report the zygosity distribution for a pair of branch lengths.
    @param fs: provides ref_length and child_length
    @return: the response text, one labelled entry per line
    """
    labels = ('p(RR):', 'p(RA):', 'p(AA):', 'p(AB):')
    distn = DGRP.get_zygosity_distribution(fs.ref_length, fs.child_length)
    out = StringIO()
    # the label at each index is paired with the distribution entry
    # at the same index
    for index, label in enumerate(labels):
        out.write('%s %s\n' % (label, distn[index]))
    return out.getvalue()
def get_response_content(fs):
    """
    Report per-position HMM posteriors and a polymorphism score.
    One line is written per observation: the observation, the four
    posterior state probabilities, and the maxpost value.
    @param fs: a FieldStorage object containing the cgi arguments; reads
    param_field, misalignment_effect, region_size and data_field
    @return: the response text
    """
    out = StringIO()
    lines = Util.get_stripped_lines(StringIO(fs.param_field))
    model = DGRP.Model()
    model.from_lines(lines)
    # see how the states interact with the observations
    states = (
            model.get_recent_state(),
            model.get_ancient_state(),
            model.get_misaligned_state(fs.misalignment_effect),
            model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / fs.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # no filenames are given to the external model
    hmm = ExternalHMM.ExternalModel(T, states, (None, None, None))
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialStringIO(converter, fs.data_field)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    # the stream is closed even if the posterior decoding raises
    try:
        # These log probabilities depend only on the sequencing error,
        # so they are computed once rather than once per position.
        r = model.seqerr
        log_Pr = math.log(r / 4.0)
        log_PA = math.log(1 - 3 * r / 4.0)
        posterior_obs_pairs = itertools.izip(
                hmm.posterior(), o_stream.read_forward())
        for p, obs in posterior_obs_pairs:
            # unpacking also checks that there are exactly four posteriors
            p_recent, p_ancient, p_misaligned, p_garbage = p
            # entry 2 of each state's posterior distribution given the
            # observation (the AA entry, going by the variable names)
            p_recent_AA = states[0].get_posterior_distribution(obs)[2]
            p_ancient_AA = states[1].get_posterior_distribution(obs)[2]
            # compute the posterior probability of a polymorphism;
            # only the recent and ancient states contribute
            posterior_polymorphism = 0
            posterior_polymorphism += p_recent * p_recent_AA
            posterior_polymorphism += p_ancient * p_ancient_AA
            # Given that a polymorphism occurred,
            # get the probability distribution over the
            # three non-reference nucleotides.
            # Entry i weights obs[i+1] by log_PA and the other two by log_Pr.
            logs = [
                    obs[1] * log_PA + obs[2] * log_Pr + obs[3] * log_Pr,
                    obs[1] * log_Pr + obs[2] * log_PA + obs[3] * log_Pr,
                    obs[1] * log_Pr + obs[2] * log_Pr + obs[3] * log_PA]
            # normalized weight of the most likely of the three entries
            # NOTE(review): scipy.misc.logsumexp is deprecated since
            # SciPy 1.0 in favour of scipy.special.logsumexp.
            condmaxpost = math.exp(max(logs) - scipy.misc.logsumexp(logs))
            maxpost = posterior_polymorphism * condmaxpost
            # show the inference for this position
            out.write('%s %s %s\n' % (obs, p, maxpost))
    finally:
        o_stream.close()
    return out.getvalue()
def get_response_content(fs):
    """
    Show the converted rows of the first chromosome in the input data.
    The data in fs.data_in is read three times: to collect the chromosome
    names, to check formatting and monotonicity, and to write the rows.
    @param fs: provides data_in, the filtered pileup text
    @return: the response text
    """
    # first pass: exhaust the skimmer so that it knows every chromosome name
    skimmer = DGRP.ChromoSkimmer()
    for _ in skimmer.skim(gen_untyped_rows(StringIO(fs.data_in))):
        pass
    chromo_names = skimmer.name_list
    # second pass: exhaust the checker so that any problem is raised
    typed_rows = gen_typed_rows(StringIO(fs.data_in))
    for _ in DGRP.check_chromo_monotonicity(typed_rows):
        pass
    # third pass: write the summary, the header, and the selected rows
    out = StringIO()
    out.write('writing the first of %d chromosomes:\n' % len(chromo_names))
    out.write('\n')
    out.write(g_header + '\n')
    for row in gen_typed_rows(StringIO(fs.data_in)):
        # rows of every chromosome other than the first are skipped
        if row[0] != chromo_names[0]:
            continue
        converted = convert_row(row)
        out.write('\t'.join(str(x) for x in converted) + '\n')
    return out.getvalue()
def gen_named_observations(self, fin):
    """
    Yield (chrom_name, observation) pairs.
    Rows are routed to the filler generator registered under their
    chromosome name in self.name_to_generator; after the input is
    exhausted every registered generator is finished in turn.
    @param fin: a file open for reading
    """
    # emit whatever each input row causes its chromosome's generator to fill
    for typed_row in gen_typed_rows(fin):
        chrom_name, position = typed_row[0], typed_row[1]
        observation = DGRP.filtered_pileup_typed_to_obs(typed_row)
        generator = self.name_to_generator[chrom_name]
        for filled in generator.fill(position, observation):
            yield chrom_name, filled
    # emit the final observations of each generator
    for chrom_name, generator in self.name_to_generator.items():
        for filled in generator.finish():
            yield chrom_name, filled
def main(args):
    """
    Print one tab-separated annotation line per observed position.
    Each line holds the observation entries, the four posterior state
    probabilities, and the maxpost value computed for the position.
    @param args: reads out_forward, out_scaling, out_backward,
    misalignment_effect, region_size and obsfile; it is also passed whole
    to DGRP.Model.from_fieldstorage
    """
    filenames = (args.out_forward, args.out_scaling, args.out_backward)
    # aggregate and validate the model parameters
    model = DGRP.Model()
    model.from_fieldstorage(args)
    # see how the states interact with the observations
    states = (
            model.get_recent_state(),
            model.get_ancient_state(),
            model.get_misaligned_state(args.misalignment_effect),
            model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / args.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # make the hmm
    hmm = ExternalHMM.ExternalModel(T, states, filenames)
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialDiskIO(converter, args.obsfile)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    # the stream is closed even if the posterior decoding raises
    try:
        # These log probabilities depend only on the sequencing error,
        # so they are computed once rather than once per position.
        r = model.seqerr
        log_Pr = math.log(r / 4.0)
        log_PA = math.log(1 - 3 * r / 4.0)
        posterior_obs_pairs = itertools.izip(
                hmm.posterior(), o_stream.read_forward())
        for p, obs in posterior_obs_pairs:
            # unpacking also checks that there are exactly four posteriors
            p_recent, p_ancient, p_misaligned, p_garbage = p
            # entry 2 of each state's posterior distribution given the
            # observation (the AA entry, going by the variable names)
            p_recent_AA = states[0].get_posterior_distribution(obs)[2]
            p_ancient_AA = states[1].get_posterior_distribution(obs)[2]
            # compute the posterior probability of a polymorphism;
            # only the recent and ancient states contribute
            posterior_polymorphism = 0
            posterior_polymorphism += p_recent * p_recent_AA
            posterior_polymorphism += p_ancient * p_ancient_AA
            # Given that a polymorphism occurred,
            # get the probability distribution over the
            # three non-reference nucleotides.
            # Entry i weights obs[i+1] by log_PA and the other two by log_Pr.
            logs = [
                    obs[1] * log_PA + obs[2] * log_Pr + obs[3] * log_Pr,
                    obs[1] * log_Pr + obs[2] * log_PA + obs[3] * log_Pr,
                    obs[1] * log_Pr + obs[2] * log_Pr + obs[3] * log_PA]
            # normalized weight of the most likely of the three entries
            # NOTE(review): scipy.misc.logsumexp is deprecated since
            # SciPy 1.0 in favour of scipy.special.logsumexp.
            condmaxpost = math.exp(max(logs) - scipy.misc.logsumexp(logs))
            maxpost = posterior_polymorphism * condmaxpost
            # show the annotation for this position
            annotation = list(obs) + list(p) + [maxpost]
            print('\t'.join(str(x) for x in annotation))
    finally:
        o_stream.close()
def __init__(self, dref, dchild, seqerr, nomcoverage, kmulticoverages):
    """
    Build a mixture over the RR, RA, AA and AB zygosity classes.
    Each class is a uniform mixture of single-pattern states, and the
    classes are weighted by the zygosity distribution of the branch lengths.
    @param dref: a branch length parameter
    @param dchild: a branch length parameter
    @param seqerr: probability of sequence randomization
    @param nomcoverage: nominal coverage
    @param kmulticoverages: allowed multiples of nominal coverage
    """
    mcov = GoodMultiCoverage(nomcoverage, kmulticoverages)
    # one distribution generator per zygosity class, in RR, RA, AA, AB order
    distn_generators = (
            gen_RR_distns, gen_RA_distns, gen_AA_distns, gen_AB_distns)
    # define the single-pattern states of every class
    state_groups = []
    for gen_distns in distn_generators:
        group = [SinglePatternState(d, mcov) for d in gen_distns(seqerr)]
        state_groups.append(group)
    # define one uniform mixture per class
    class_mixtures = tuple(
            ReadCoverage.UniformMixture(group) for group in state_groups)
    zygo_distn = DGRP.get_zygosity_distribution(dref, dchild)
    ReadCoverage.Mixture.__init__(self, class_mixtures, zygo_distn)
def gen_typed_rows(fin):
    """
    Yield one typed row per non-blank line of a filtered pileup file.
    @param fin: an iterable of lines, such as a file open for reading
    """
    for raw_line in fin:
        fields = raw_line.split()
        # lines that contain only whitespace carry no row
        if not fields:
            continue
        yield DGRP.filtered_pileup_row_to_typed(fields)