Example #1
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    out = StringIO()
    # heterozygous
    print >> out, 'heterozygous region:'
    a = fs.x
    b = fs.y + fs.z
    distn = DGRP.get_zygosity_distribution(a, b)
    print >> out, distn_to_string(distn)
    print >> out
    # homozygous
    print >> out, 'homozygous region:'
    a = fs.x + fs.y
    b = fs.z
    distn = DGRP.get_zygosity_distribution(a, b)
    print >> out, distn_to_string(distn)
    print >> out
    # misaligned
    print >> out, 'misaligned homozygous region:'
    a = fs.w + fs.x + fs.y
    b = fs.z
    distn = DGRP.get_zygosity_distribution(a, b)
    print >> out, distn_to_string(distn)
    return out.getvalue()
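
The helper distn_to_string is used above but not defined in the listing. As a rough sketch only, assuming the distribution follows the (RR, RA, AA, AB) index convention that a later example prints as p(RR), p(RA), p(AA), p(AB), it could be as simple as:

def distn_to_string(distn):
    """
    Hypothetical helper, not part of the original listing.
    @param distn: a (p_RR, p_RA, p_AA, p_AB) zygosity distribution
    @return: a multi-line string with one labelled probability per line
    """
    labels = ('p(RR)', 'p(RA)', 'p(AA)', 'p(AB)')
    return '\n'.join('%s: %s' % (k, v) for k, v in zip(labels, distn))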
Example #2
def gen_output_lines(args, fin):
    """
    Yield observation lines, given a filtered pileup file open for reading.
    """
    # unpack some relevant arguments
    reqambig = args.reqambig
    fill = args.fill
    errlow, errhigh = args.errlow, args.errhigh
    low = 1 if args.low == 'drosophila' else args.low
    high = args.high
    # create some state maintained across input lines
    filler = None
    chrom_name = None
    name_to_drosophila_length = dict(DGRP.g_chromosome_length_pairs)
    # define the default line to write
    default_obs = (0, 0, 0, 0)
    # process the input file line by line
    for line in fin:
        srow = line.split()
        if not srow:
            continue
        row = DGRP.filtered_pileup_row_to_typed(srow)
        obs = DGRP.filtered_pileup_typed_to_obs(row)
        name, pos, ref = row[:3]
        if filler is None:
            # set the chromosome name
            chrom_name = name
            # if appropriate, update the high value using the chrom name
            if args.high == 'drosophila':
                high = name_to_drosophila_length.get(name, None)
                if high is None:
                    raise Exception('invalid fly chromosome: ' + name)
            else:
                high = args.high
            # define the filler generator object
            filler = iterfiller.FillerGenerator(low, high,
                    fill, errlow, errhigh, default_obs)
        # check the chromosome name for consistency
        if name != chrom_name:
            raise Exception(
                    'conflicting chromosome '
                    'names: %s %s' % (name, chrom_name))
        # check for reference nucleotide weirdness
        if reqambig:
            if not filler.check_bounds(pos):
                if ref != 'N':
                    raise Exception(
                            'expected out of bounds reference nucleotides '
                            'to be N but found %s '
                            'at position %d of chrom %s' % (ref, pos, name))
        # process lines emitted by the filler
        for value in filler.fill(pos, obs):
            yield '\t'.join(str(x) for x in value)
    # process final lines emitted by the filler
    # (the filler is never created if the input has no data lines)
    if filler is not None:
        for value in filler.finish():
            yield '\t'.join(str(x) for x in value)
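
The loop above relies on a fill/finish contract for iterfiller.FillerGenerator: fill(pos, obs) is expected to yield the default observation for any positions skipped since the last call before yielding obs itself, and finish() to pad defaults out to the high bound. That reading is an assumption; the toy class below only illustrates the contract (it ignores the fill mode and the errlow/errhigh bounds the real generator also takes) and is not the iterfiller implementation.

class ToyFiller(object):
    """Illustrative stand-in for the fill/finish contract assumed above."""
    def __init__(self, low, high, default_obs):
        self.next_pos = low
        self.high = high
        self.default_obs = default_obs
    def fill(self, pos, obs):
        # yield the default observation for any skipped positions
        while self.next_pos < pos:
            yield self.default_obs
            self.next_pos += 1
        yield obs
        self.next_pos = pos + 1
    def finish(self):
        # pad defaults out to the high bound
        while self.next_pos <= self.high:
            yield self.default_obs
            self.next_pos += 1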
Example #3
def main(args):
    """
    @param args: positional and flaglike arguments
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    # make sure that the output directory exists
    if not os.path.isdir(output_directory):
        if force:
            os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # scan the input file for chromosome names
    ch_paths = []
    skimmer = DGRP.ChromoSkimmer()
    with open(input_filename) as fin:
        for chromo_name in skimmer.skim(gen_untyped_rows(fin)):
            output_filename = args.out_prefix + chromo_name + args.out_suffix
            ch_path = os.path.join(output_directory, output_filename)
            ch_paths.append(ch_path)
            if not force:
                if os.path.exists(ch_path):
                    raise Exception('output already exists: ' + ch_path)
    chromo_names = skimmer.name_list
    nlines = skimmer.linecount
    # start the progress bar
    nticks = 2 * nlines
    pbar = Progress.Bar(nticks)
    # scan the input file for correct types and for monotonicity
    with open(input_filename) as fin:
        for i in DGRP.check_chromo_monotonicity(gen_typed_rows(fin)):
            pbar.increment()
    # open the output files for writing
    ch_files = []
    for p in ch_paths:
        ch_files.append(open(p, 'wt'))
    # write the headers
    if not args.noheader:
        for f in ch_files:
            f.write(g_header + '\n')
    # write the lines
    name_to_file = dict(zip(chromo_names, ch_files))
    with open(input_filename) as fin:
        for row in gen_typed_rows(fin):
            name = row[0]
            row_out = convert_row(row)
            f = name_to_file[name]
            line_out = '\t'.join(str(x) for x in row_out)
            f.write(line_out + '\n')
            pbar.increment()
    # close the files
    for f in ch_files:
        f.close()
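
For reference, main reads the attributes infile, outdir, force, out_prefix, out_suffix, and noheader from args. A hypothetical argparse setup that would supply them, placed in the same module as main, might look like the following; the flag names, help strings, and defaults are assumptions and are not taken from the original script.

import argparse

def make_parser():
    # hypothetical wiring; only the attribute names are taken from main()
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='filtered pileup input file')
    parser.add_argument('outdir', help='directory for per-chromosome output files')
    parser.add_argument('--force', action='store_true',
            help='create the output directory and overwrite existing files')
    parser.add_argument('--out-prefix', dest='out_prefix', default='chromosome.',
            help='prefix for each output filename')
    parser.add_argument('--out-suffix', dest='out_suffix', default='.txt',
            help='suffix for each output filename')
    parser.add_argument('--noheader', action='store_true',
            help='do not write a header line to each output file')
    return parser

if __name__ == '__main__':
    main(make_parser().parse_args())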
Example #4
def main(args):
    filenames = (args.out_forward, args.out_scaling, args.out_backward)
    # aggregate and validate the model parameters
    model = DGRP.Model()
    model.from_fieldstorage(args)
    # see how the states interact with the observations
    states = (model.get_recent_state(), model.get_ancient_state(),
              model.get_misaligned_state(args.misalignment_effect),
              model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / args.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # make the hmm
    hmm = ExternalHMM.ExternalModel(T, states, filenames)
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialDiskIO(converter, args.obsfile)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    for p, obs in itertools.izip(hmm.posterior(), o_stream.read_forward()):
        p_recent, p_ancient, p_misaligned, p_garbage = p
        maxpost = get_maxpost(p_recent, p_ancient, p_misaligned, p_garbage)
        # show the annotation for this position
        annotation = list(obs) + list(p) + [maxpost]
        print '\t'.join(str(x) for x in annotation)
    o_stream.close()
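
The transition setup above deserves a brief note. Reading prandom as the probability of re-drawing the hidden state uniformly from all nstates states at each position (my interpretation of UniformTransitionObject, which is an assumption), the chance of actually switching states is prandom * (nstates - 1) / nstates = 1 / region_size, so a constant-state run is expected to last roughly region_size positions. A quick check of the arithmetic:

def expected_run_length(nstates, region_size):
    # probability of re-drawing the state uniformly at each position
    prandom = min(1.0, (nstates / (nstates - 1.0)) / region_size)
    # probability that the re-draw actually lands on a different state
    p_switch = prandom * (nstates - 1.0) / nstates
    # geometric expectation of the run length between switches
    return 1.0 / p_switch

# with the four states above and region_size=1000, this gives 1000.0
assert abs(expected_run_length(4, 1000) - 1000.0) < 1e-6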
Example #5
def get_response_content(fs):
    distn = DGRP.get_zygosity_distribution(fs.ref_length, fs.child_length)
    out = StringIO()
    print >> out, 'p(RR):', distn[0]
    print >> out, 'p(RA):', distn[1]
    print >> out, 'p(AA):', distn[2]
    print >> out, 'p(AB):', distn[3]
    return out.getvalue()
Example #6
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    out = StringIO()
    lines = Util.get_stripped_lines(StringIO(fs.param_field))
    model = DGRP.Model()
    model.from_lines(lines)
    # see how the states interact with the observations
    states = (model.get_recent_state(), model.get_ancient_state(),
              model.get_misaligned_state(fs.misalignment_effect),
              model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / fs.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # use StringIO objects for storage
    hmm = ExternalHMM.ExternalModel(T, states, (None, None, None))
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialStringIO(converter, fs.data_field)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    for p, obs in itertools.izip(hmm.posterior(), o_stream.read_forward()):
        p_recent, p_ancient, p_misaligned, p_garbage = p
        # probability of the AA (polymorphism) zygosity under each state, given the observation
        p_recent_AA = states[0].get_posterior_distribution(obs)[2]
        p_ancient_AA = states[1].get_posterior_distribution(obs)[2]
        # compute the posterior probability of a polymorphism
        posterior_polymorphism = 0
        posterior_polymorphism += p_recent * p_recent_AA
        posterior_polymorphism += p_ancient * p_ancient_AA
        # Given that a polymorphism occurred,
        # get the probability distribution over the
        # three non-reference nucleotides.
        r = model.seqerr
        log_Pr = math.log(r / 4.0)
        log_PA = math.log(1 - 3 * r / 4.0)
        logs = [
            obs[1] * log_PA + obs[2] * log_Pr + obs[3] * log_Pr,
            obs[1] * log_Pr + obs[2] * log_PA + obs[3] * log_Pr,
            obs[1] * log_Pr + obs[2] * log_Pr + obs[3] * log_PA
        ]
        condmaxpost = math.exp(max(logs) - scipy.misc.logsumexp(logs))
        # posterior probability of the most likely non-reference call
        maxpost = posterior_polymorphism * condmaxpost
        # show the inference for this position
        print >> out, obs, p, maxpost
    o_stream.close()
    return out.getvalue()
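
The condmaxpost line is worth unpacking: for log-likelihoods logs, exp(max(logs) - logsumexp(logs)) equals the largest likelihood divided by the sum of the likelihoods, i.e. the posterior weight of the best of the three candidate non-reference nucleotides under a uniform prior, computed without underflow. A small self-contained check of that identity, using a hand-rolled log-sum-exp rather than scipy.misc.logsumexp:

import math

def logsumexp(logs):
    # numerically stable log of a sum of exponentials
    m = max(logs)
    return m + math.log(sum(math.exp(x - m) for x in logs))

def max_posterior_weight(logs):
    # equivalent to max(likelihoods) / sum(likelihoods), but stable in log space
    return math.exp(max(logs) - logsumexp(logs))

# three candidate log-likelihoods that would underflow if exponentiated directly
logs = [-1000.0, -1001.0, -1003.0]
weights = [math.exp(x - logsumexp(logs)) for x in logs]
assert abs(sum(weights) - 1.0) < 1e-9
assert abs(max_posterior_weight(logs) - max(weights)) < 1e-9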
Example #7
def get_response_content(fs):
    # quickly skim the lines to get some info
    fin = StringIO(fs.data_in)
    skimmer = DGRP.ChromoSkimmer()
    for chromo_name in skimmer.skim(gen_untyped_rows(fin)):
        pass
    chromo_names = skimmer.name_list
    nlines = skimmer.linecount
    # check formatting and monotonicity
    fin = StringIO(fs.data_in)
    for i in DGRP.check_chromo_monotonicity(gen_typed_rows(fin)):
        pass
    # begin writing
    out = StringIO()
    print >> out, 'writing the first of', len(chromo_names), 'chromosomes:'
    print >> out
    # write only the first chromosome
    fin = StringIO(fs.data_in)
    print >> out, g_header
    for row in gen_typed_rows(fin):
        name = row[0]
        if name == chromo_names[0]:
            print >> out, '\t'.join(str(x) for x in convert_row(row))
    return out.getvalue()
Example #8
def gen_named_observations(self, fin):
    """
    Yield (chrom_name, observation) pairs.
    @param fin: a file open for reading
    """
    # Process each row of the input file,
    # yielding an observation for each filled position.
    for row in gen_typed_rows(fin):
        name, position = row[0], row[1]
        value = DGRP.filtered_pileup_typed_to_obs(row)
        fg = self.name_to_generator[name]
        for obs in fg.fill(position, value):
            yield name, obs
    # Flush any remaining observations from each filler generator.
    for name, fg in self.name_to_generator.items():
        for obs in fg.finish():
            yield name, obs
Example #9
def main(args):
    filenames = (args.out_forward, args.out_scaling, args.out_backward)
    # aggregate and validate the model parameters
    model = DGRP.Model()
    model.from_fieldstorage(args)
    # see how the states interact with the observations
    states = (model.get_recent_state(), model.get_ancient_state(),
              model.get_misaligned_state(args.misalignment_effect),
              model.get_garbage_state())
    # define the transition object
    nstates = len(states)
    prandom = min(1.0, (nstates / (nstates - 1.0)) / args.region_size)
    T = TransitionMatrix.UniformTransitionObject(prandom, nstates)
    # make the hmm
    hmm = ExternalHMM.ExternalModel(T, states, filenames)
    converter = lineario.IntTupleConverter()
    o_stream = lineario.SequentialDiskIO(converter, args.obsfile)
    hmm.init_dp(o_stream)
    o_stream.open_read()
    for p, obs in itertools.izip(hmm.posterior(), o_stream.read_forward()):
        p_recent, p_ancient, p_misaligned, p_garbage = p
        # probability of the AA (polymorphism) zygosity under each state, given the observation
        p_recent_AA = states[0].get_posterior_distribution(obs)[2]
        p_ancient_AA = states[1].get_posterior_distribution(obs)[2]
        # compute the posterior probability of a polymorphism
        posterior_polymorphism = 0
        posterior_polymorphism += p_recent * p_recent_AA
        posterior_polymorphism += p_ancient * p_ancient_AA
        # Given that a polymorphism occurred,
        # get the probability distribution over the
        # three non-reference nucleotides.
        r = model.seqerr
        log_Pr = math.log(r / 4.0)
        log_PA = math.log(1 - 3 * r / 4.0)
        logs = [
            obs[1] * log_PA + obs[2] * log_Pr + obs[3] * log_Pr,
            obs[1] * log_Pr + obs[2] * log_PA + obs[3] * log_Pr,
            obs[1] * log_Pr + obs[2] * log_Pr + obs[3] * log_PA
        ]
        condmaxpost = math.exp(max(logs) - scipy.misc.logsumexp(logs))
        # posterior probability of the most likely non-reference call
        maxpost = posterior_polymorphism * condmaxpost
        # show the annotation for this position
        annotation = list(obs) + list(p) + [maxpost]
        print '\t'.join(str(x) for x in annotation)
    o_stream.close()
Example #10
def __init__(self, dref, dchild, seqerr, nomcoverage, kmulticoverages):
    """
    @param dref: a branch length parameter
    @param dchild: a branch length parameter
    @param seqerr: probability of sequence randomization
    @param nomcoverage: nominal coverage
    @param kmulticoverages: allowed multiples of nominal coverage
    """
    mcov = GoodMultiCoverage(nomcoverage, kmulticoverages)
    # define the states
    r = seqerr
    RR_states = [SinglePatternState(d, mcov) for d in gen_RR_distns(r)]
    RA_states = [SinglePatternState(d, mcov) for d in gen_RA_distns(r)]
    AA_states = [SinglePatternState(d, mcov) for d in gen_AA_distns(r)]
    AB_states = [SinglePatternState(d, mcov) for d in gen_AB_distns(r)]
    # define the distributions
    RR = ReadCoverage.UniformMixture(RR_states)
    RA = ReadCoverage.UniformMixture(RA_states)
    AA = ReadCoverage.UniformMixture(AA_states)
    AB = ReadCoverage.UniformMixture(AB_states)
    states = (RR, RA, AA, AB)
    zygo_distn = DGRP.get_zygosity_distribution(dref, dchild)
    ReadCoverage.Mixture.__init__(self, states, zygo_distn)
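
Assuming ReadCoverage.Mixture follows the usual mixture-model semantics (a weighted sum of component likelihoods; this is my reading and is not confirmed by the listing), the constructor above wires up P(obs) = sum_k w_k * P_k(obs) with the four zygosity classes RR, RA, AA, AB as components and get_zygosity_distribution(dref, dchild) as the weights. A generic sketch of that likelihood and of the per-component posterior used elsewhere in these examples; get_likelihood is a hypothetical method name.

def mixture_likelihood(components, weights, obs):
    # weighted sum of component likelihoods
    return sum(w * c.get_likelihood(obs) for c, w in zip(components, weights))

def mixture_posterior(components, weights, obs):
    # posterior probability of each component given the observation
    joints = [w * c.get_likelihood(obs) for c, w in zip(components, weights)]
    total = sum(joints)
    return [j / total for j in joints]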
Example #11
def gen_typed_rows(fin):
    for line in fin:
        srow = line.split()
        if srow:
            yield DGRP.filtered_pileup_row_to_typed(srow)
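
A small usage sketch for the generator above, assuming DGRP is importable, gen_typed_rows is in scope, and filename points at a filtered pileup file; it simply tallies how many typed rows each chromosome contributes.

from collections import Counter

def count_rows_per_chromosome(filename):
    # tally typed rows per chromosome name (row[0] is the chromosome name)
    counts = Counter()
    with open(filename) as fin:
        for row in gen_typed_rows(fin):
            counts[row[0]] += 1
    return counts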