Example #1
def get_scaff_from_minimus(contig):
    '''takes <contig> filename, returns GFF.File of scaffolding, e.g.

    LS146000       .       .       1       946     .       -       .       contig_end=1904;contig=8;contig_start=1004
    '''

    contigs = GFF.File()

    for l in open(contig):
        match = re.search(
            r'#(?P<seqid>.+?)\(\d+\)\s(?P<strand>.+?)\s.+?\{(?P<sstart>\d+)\s(?P<send>\d+)\}\s\<(?P<cstart>\d+)\s(?P<cend>\d+)\>',
            l)
        if l.startswith('##'):
            num = l.split()[0][2:]
        elif match:
            m = match.groupdict()
            r = GFF.Region()
            r['seqid'] = m['seqid']
            r['start'], r['end'] = [
                str(s) for s in sorted((int(m['sstart']), int(m['send'])))
            ]
            r['attribute_contig'] = num
            r['attribute_cstart'], r['attribute_cend'] = [
                str(s) for s in sorted((int(m['cstart']), int(m['cend'])))
            ]
            if m['strand'] == '[RC]':
                r['strand'] = '-'
            else:
                r['strand'] = '+'
            contigs.append(r)
    return contigs
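A quick check of the regex above against one hypothetical minimus-style contig header line (format and values are illustrative, not taken from the source):

import re

line = '#LS146000(946) [RC] 946 bases, 00000000 checksum. {1946 1001} <1004 1904>'
pattern = r'#(?P<seqid>.+?)\(\d+\)\s(?P<strand>.+?)\s.+?\{(?P<sstart>\d+)\s(?P<send>\d+)\}\s\<(?P<cstart>\d+)\s(?P<cend>\d+)\>'
m = re.search(pattern, line).groupdict()
# m == {'seqid': 'LS146000', 'strand': '[RC]', 'sstart': '1946',
#       'send': '1001', 'cstart': '1004', 'cend': '1904'}
# get_scaff_from_minimus then sorts sstart/send into start=1001, end=1946
# and maps '[RC]' to strand '-'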
Example #2
def main(gmap_filename):
    from collections import defaultdict
    output = []
    CT = defaultdict(lambda: ClusterTree(0,0))
    for r in GFF.gmapGFFReader(gmap_filename):
        if r.coverage >= 95. and all(x>=80 for x in r.scores):
            output.append(r)
            i = len(output)-1
            CT[r.chr].insert(r.start, r.end, i)

    f = open(gmap_filename+'.cov95score80_consolidated_UCSC.gff', 'w')
    for ct in CT.itervalues():
        for a,b,record_indices in ct.getregions():
            records = [output[ind] for ind in record_indices]
            # continuously merge potentially compatible records
            i = 0
            while i < len(records)-1:
                j = i + 1
                while j < len(records) and records[j].start < records[i].end:
                    if consolidate(records[i], records[j]) is None: j += 1
                    else: records.pop(j) # j is merged into i, delete
                i += 1
            for r in records:
                GFF.write_GFF_UCSCformat(f, r)
    
    f.close()
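ClusterTree here presumably comes from bx-python (bx.intervals.cluster); a minimal sketch, under that assumption, of the clustering behavior the merge loop above relies on:

from bx.intervals.cluster import ClusterTree

ct = ClusterTree(0, 0)  # max gap between intervals, min intervals per cluster
ct.insert(100, 200, 0)  # (start, end, record index)
ct.insert(150, 300, 1)  # overlaps the first -> merged into one region
ct.insert(500, 600, 2)  # disjoint -> its own region
for start, end, indices in ct.getregions():
    print start, end, indices  # 100 300 [0, 1], then 500 600 [2]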
Example #3
def main(cons_filename, in_filename):
    run_gmap(cons_filename)
    run_gmap(in_filename)

    cons = GFF.gmapGFFReader(cons_filename+'.gff').next()
    good, bad = 0, []
    for r in GFF.gmapGFFReader(in_filename+'.gff'):
        if compare_gff(cons, r): good += 1
        else: bad.append(r.seqid)
    
    print "{0}/{1} agree with consensus".format(good, good+len(bad))
    print "Disagreement:"
    for x in bad: print x
    return good*1./(good+len(bad))
Example #4
def main():
    global OUT_FH
    infile = 'c_elegans.PRJNA13758.WS261.canonical_geneset.gtf'
    current_gene = None
    current_transcript = None
    transcript_count = 0
    for i,line in enumerate(open(infile)):
        # things to skip
        if line.startswith('#'): continue
        if line.startswith('MtDNA'): continue

        # refer to line if parsing fails
        try:
            gff = GFF.GFFParser.parseLine(line, exclude_features=['five_prime_utr', 'three_prime_utr', 'CDS','start_codon','stop_codon'])
        except:
            print >>sys.stderr, "failed on line %d:%s" % (i,line)
            raise
        if not gff: # False when feature is excluded (this allows the bypassing of parse_attr, which is expensive)
            continue

        # file is arranged by gene|transcript1[,transcript2,...]
        # so transcripts may be assumed to appear in groups
        if gff['name'] == 'transcript':
            #if current_transcript is not None: process_transcript(current_transcript)
            current_transcript = GFF.Transcript(gff)
            transcript_count += 1

            if current_gene is not None: 
                current_gene.add_transcript( current_transcript )

            # break if debugging
            if howmany and transcript_count > howmany: break
            
        elif gff['name'] == 'gene':
            if current_gene is not None:
                #print current_gene, len(current_gene)
                process_gene_model( current_gene )
                """
                for ptuple in list(current_gene.get_promoters(300)):
                    plist = list(ptuple)
                    plist[0] = "chr" + plist[0]
                    print " ".join(map(str,plist))
                """
            current_gene = GFF.Gene(gff)

        else: # add any other gff type (exon,CDS,etc) to current transcript
            current_transcript.append(gff)

    OUT_FH.close()
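The gene/transcript state machine above relies on the GTF being grouped by gene, with each gene's transcripts and their features following it; an illustrative (not source-derived) ordering it expects:

# gene g1          <- current_gene = GFF.Gene(gff)
#   transcript t1a <- current_transcript, added to g1
#     exon ...     <- appended to t1a
#   transcript t1b
#     exon ...
# gene g2          <- process_gene_model(g1) runs, g2 becomes current_gene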
Example #5
def main():
    GTFgen = GFF.parse(GTFfile)
    GFFlist = makeGFFlist(GTFgen)
    ucscIDlist, transcriptdict = build_stopcodon_table(GFFlist,
                                                       inculde_noncanon_start,
                                                       include_noncanon_stop)
    write_utr_stopcodon_csvfile(ucscIDlist, transcriptdict)
Example #6
def draw_bs_plot(sites,sp_order,site_styles,seq_lens,offsets=None,maxheight=0.8,minheight=0.4,
                 fig=1,subpl=111,clear_plot=True,filename=None,**figargs):

    by_factor = dict((src, GFF.File()) for src in set(r['source'] for r in sites))

    for r in sites:
        cut = site_styles[r['source']]['cut']
        if r['score'] < cut and r['seqid'] in sp_order:
            by_factor[r['source']].append(r)


    print by_factor

    for k,v in by_factor.items():
        normscores = Util.normalize([r['score'] for r in v],minheight,maxheight,to_abs=1)
        for i,vn in enumerate(normscores):
            by_factor[k][i]['score'] = vn

    sites_to_plot = []
    for f in by_factor.values():
        sites_to_plot.extend(f)
            
    figo = pylab.figure(fig,**figargs)
    if clear_plot:
        figo.clf()
        figo = pylab.figure(fig,**figargs)

    ax = figo.add_subplot(subpl)
    ax.set_yticks([])

    #calc offsets, draw lines
    if offsets is None:
        offsets = [None]*(len(sp_order)+1)
        midpt = max([v for k,v in seq_lens.items() if k in sp_order])/2
    for i,sp in enumerate(sp_order):
        rank = len(sp_order) - i
        if offsets[rank] is None:
            off = midpt - seq_lens[sp]/2
            offsets[rank] = off
            print off,rank,seq_lens[sp]+off,rank
        ax.text(5,rank,sp)
        ax.add_line(matplotlib.lines.Line2D((offsets[rank],seq_lens[sp]+offsets[rank]),(rank,rank),color='k',alpha=0.25,lw=5))
        

    for site in sites_to_plot:
        fc = site_styles[site['source']]['color']
        ec = fc
        rank = len(sp_order) - sp_order.index(site['seqid'])
        ax.add_patch(matplotlib.patches.Ellipse( (site['start']+offsets[rank],rank),
                                                   len(site),
                                                   site['score'],
                                                   fc=fc,ec=ec,alpha=site['score'] )
                     )
    
    if filename:
        ax.autoscale_view()
        figo.savefig(filename)
    else:
        pylab.plot()
Example #7
def main(gff_file, fasta_file):
    out_file = "%s.gbk" % os.path.splitext(gff_file)[0]
    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    gff_iter = GFF.parse(gff_file, fasta_input)
    record = next(_check_gff(_fix_ncbi_id(gff_iter)))
    # hack to fix bug where DNAAlphabet turns into SingleLetterAlphabet() in the parser
    record.seq.alphabet = generic_dna
    SeqIO.write(record, out_file, "genbank")
Example #8
def writegene_wf2(shift5, shift3, riboshift, densityfile, feature, gfffile,
                  utrgfffilename, outfile):
    GFFgen = GFF.parse(gfffile)
    counts1 = readcountsf(densityfile + "_plus_")
    counts2 = readcountsf(densityfile + "_minus_")
    counts = [counts1, counts2]
    idtable = makeidtable2(GFFgen)
    GFFgen = GFF.parse(gfffile)
    GFFlist = makeGFFlist(GFFgen)
    goodgenes = 2
    print feature
    if utrgfffilename == "-1":
        utrtable = {}
    else:
        utrgffgen = GFF.parse(utrgfffilename)
        utrtable = genometools.makeutrtable(utrgffgen)
    bp1 = shift5
    chromosome = idtable[feature][2]
    featurenum = idtable[feature][1]
    longfeature = idtable[feature][0].id

    if utrtable.has_key(longfeature):
        bp2 = utrtable[longfeature][1] - utrtable[longfeature][0] + shift3
    else:
        bp2 = 0 + shift3

    bp = [bp1, bp2, riboshift]
    retval = givegene(chromosome, featurenum, GFFlist, counts, bp, goodgenes)
    if retval[0] == -1:
        print "Not a good gene..."
        return

    t = []
    t.append(["pos", "rpm"])
    i = -shift5
    while i < len(retval[0]) - shift5:
        newline = [i, retval[0][i + shift5]]
        t.append(newline)
        i += 1

    fcsv = open(outfile + "_" + feature + ".csv", "w")
    writer = csv.writer(fcsv)
    writer.writerows(t)
    fcsv.close()
Example #9
def main():
    ### not sure what I was using this for...
    # # mRNAdict = pandas.read_csv(mRNAseqsInfile, index_col=0, skiprows=1).T.to_dict()
    # mRNAdict = pandas.read_csv(mRNAseqsInfile, index_col=0, skiprows=1)
    # print mRNAdict.head()
    # # print mRNAdict['']

    ### The actual function:
    GTFgen = GFF.parse(GTFfile)
    GFFlist = makeGFFlist(GTFgen)
    ucscIDlist, transcriptdict = get_Prot_sequence(GFFlist)
    write_utr_stopcodon_csvfile(ucscIDlist, transcriptdict)
Example #10
def makeGFFlist(GFFname):
    """ Tool for loading the entire yeast genome into memory
    From seqtools
    Called st.makeGFFlist(GFF.parse(codingGFF))
    Returns dictionary GFFlist[chr.id] = chr for chr in GFFgen
    Called for main coding GFF but not utr5GFF and utr3GFF -- generalize?
    Will this be affected it GFF is changed? Probably not, no parsing here, just storing
    """
    GFFlist = {}
    for chr in GFF.parse(GFFname):
        GFFlist[chr.id] = chr
    return GFFlist
Example #11
def parse_GFF(utrGFF):
    """ Very thin wrapper that tries to do makeutrtable(GFF.parse(utrGFF))
        but checks for IOError and returns an empty dictionary instead.
    """
    try:
        return makeutrtable(GFF.parse(utrGFF))
    except IOError:
        print "Warning! " + utrGFF + " couldn't be found."
        if raw_input("'c' to continue with empty dictionary\n") == 'c':
            return {}
        else:
            quit()
Example #12
def plot_density(pickle_filename, bam_filename):
    print "Plotting density along alternative isoform"
    print "  - pickle: %s" %(pickle_filename)
    print "  - BAM: %s" %(bam_filename)
    gff_genes = gff_utils.load_indexed_gff_file(pickle_filename)
    #bamfile = pysam.Samfile(bam_filename, "rb")

    plot_gene(gff_genes)

    plt.show()

    print "gff_genes: ", gff_genes
Example #13
def main(gmap_filename, fasta_filename):
    """
    Given a GMAP output (.gff) compare the aligned start/end
    to Gencode annotations (transcript & polyA)

    Need the original fasta to get sequence length
    """
    seqlen_dict = dict([(r.id,len(r.seq)) for r in SeqIO.parse(open(fasta_filename),'fasta')])
    gtf_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.annotation.gtf'
    gtfA_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.polyAs.gtf'

    gtf = GFF.GTF(gtf_f)
    gtfA = GFF.polyAGFF(gtfA_f)

    f = open(gmap_filename+'.summary', 'w')
    f.write("ID\thit5_exon\thit5_dist\thit5_id\thit3_exon\thit3_dist\thit3_id\thitA_dist\n")
    reader = GFF.gmapGFFReader(gmap_filename)
    while True:
        try:
            r = reader.next()
        except AssertionError: #ignore bad gmap output
            continue
        except StopIteration:
            break
        except:
            continue
        if r.coverage < min_coverage: continue
        # IMPORTANT! if r.start/r.end is not complete, extend it!
        r_start_corrected = r.start - r.seq_exons[0].start
        r_end_corrected = r.end + (seqlen_dict[r.seqid] - r.seq_exons[-1].end)
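        # e.g., if the first 15 bp of the query did not align
        # (r.seq_exons[0].start == 15) and r.start == 1000, the corrected
        # genomic start is 1000 - 15 = 985; the end is extended similarly
        # by the unaligned tail, seqlen_dict[r.seqid] - r.seq_exons[-1].end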
        hit5, hit3 = validate_53seen(gtf, r.chr, r_start_corrected, r_end_corrected, r.strand)
        hitA = validate_polyA(gtfA, r.chr, r_start_corrected, r_end_corrected, r.strand)
        f.write("{id}\t{e5}\t{d5}\t{i5}\t{e3}\t{d3}\t{i3}\t{dA}\n".format(\
                id=r.seqid, e5=hit5[0], d5=hit5[1], i5=hit5[2], e3=hit3[0], d3=hit3[1], i3=hit3[2], dA=hitA))

    f.close()
Example #14
def test_random_subset_gcon(records, iterations=10, size=10):
    for iter in xrange(iterations):
        picked = random.sample(records, size)
        f = open("tmp.fa", "w")
        for r in picked:
            f.write(">{0}\n{1}\n".format(r.id, r.seq))
        f.close()
        if os.path.exists("tmp_cons.fa"):
            os.remove("tmp_cons.fa")
        if os.path.exists("tmp_cons.fa.gff"):
            os.remove("tmp_cons.fa.gff")
        os.system("ice_pbdagcon.py tmp.fa tmp_cons tmp_cons --nproc 12")
        run_gmap("tmp_cons.fa")
        r2 = GFF.gmapGFFReader("tmp_cons.fa.gff").next()
        yield r2
Example #15
def process_contig(dirname, f):
    d = defaultdict(lambda: []) # seqid (path number) --> list of (start, end, chr, chr_start, chr_end)
    reader = GFF.gmapGFFReader(os.path.join(dirname,'aloha2.fa.cuttlefish.gff'))
    for r in reader:
        if r.strand == '+': s,e = r.seq_exons[0].start, r.seq_exons[-1].end
        else: s,e = r.seq_exons[-1].start, r.seq_exons[0].end
        d[r.seqid].append((s, e, r.chr, r.start, r.end))

    if len(d) == 0: return
    
    for path_i, x in d.iteritems():
        x.sort(key=lambda x: x[0])
        xx = [_chr for s,e,_chr,_chr_s,_chr_e in x]
        f.write("{0}\t{1}\t{2}\n".format(dirname, path_i, ",".join(xx)))
        for s,e,_chr,_chr_s,_chr_e in x:
            f.write("#{0}:{1}-{2}\t{3}:{4}-{5}\n".format(path_i,s,e,_chr,_chr_s,_chr_e))
Example #17
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome.

    Return: dict of {cogent path} --> list of gmapRecord; set of mapped genome contigs

    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    d = defaultdict(lambda: [])
    contigs_seen = set()

    if not os.path.exists(filename):
        return {}, set()

    try:
        for r in GFF.gmapGFFReader(filename):
            d[r.seqid].append(r)
            contigs_seen.add(r.chr)
    except IndexError:
        pass
    return dict(d), contigs_seen
Example #19
def compute_gene_psi(gene_ids,
                     gff_index_filename,
                     bam_filename,
                     output_dir,
                     read_len,
                     overhang_len,
                     paired_end=None,
                     event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." % (num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" % (gff_index_filename)
    print "  - BAM: %s" % (bam_filename)
    print "  - Outputting to: %s" % (output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)
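        # sd -> variance: e.g. --paired-end 250 15 gives
        # mean_frag_len = 250 and frag_variance = 15 ** 2 = 225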

    # Load the genes from the GFF
    # print "Loading genes from indexed GFF..."
    # t1 = time.time()
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    # t2 = time.time()
    # print "  - Loading took: %.2f seconds" %(t2 - t1)

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(
            gene_info['hierarchy'][gene_id])

        # If given a template for the SAM file, use it
        template = None

        if settings and "sam_template" in settings:
            template = settings["sam_template"]

        # Load the BAM file
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads,
                                                gene_obj,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if none of the reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print "Only %d reads in gene, skipping (needed >= %d reads)" \
                  %(num_raw_reads, min_event_reads)
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(
                num_isoforms, mean_frag_len, frag_variance, read_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(
                num_isoforms, read_len, overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters,
                            reads,
                            gene_obj,
                            hyperparameters,
                            sampler_params,
                            output_filename,
                            burn_in=burn_in,
                            lag=lag)
Example #20
gffSource = 'genscan'
gffClass = 'Genscan'
featureType = 'gene'
subfeatureType = 'exon'

for gene in data:
    name = gene[0]['gene.exon'].split('.')[0]
    output = []
    extrema = []
    for exon in gene:
        if exon['type'] in ['Init','Intr','Term','Sngl']:
            output.append(GFF.output(
                reference = reference,
                source = gffSource,
                type = subfeatureType,
                start = exon['start']+offset-1,
                end = exon['end']+offset-1,
                strand = exon['strand'],
                score = exon['score'],
                group = '%s %s' % (gffClass, name)
            ))
            extrema.append(exon['start'])
            extrema.append(exon['end'])

    output.insert(0, GFF.output(
        reference = 'scaffold_42',
        source = gffSource,
        type = featureType,
        start = min(extrema)+offset-1,
        end = max(extrema)+offset-1,
        strand = gene[0]['strand'],
        group = '%s %s' % (gffClass, name)
    ))
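The "+offset-1" arithmetic above presumably converts Genscan's 1-based positions within the extracted subsequence back to reference coordinates; a worked example under that assumption (values illustrative):

# offset = 5000: 1-based reference position where the subsequence begins
# exon['start'] = 1   -> 1 + 5000 - 1 = 5000 (the subsequence's first base
#                        maps back to the reference position it starts at)
# exon['end'] = 300   -> 300 + 5000 - 1 = 5299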
Example #21
def main():
    from optparse import OptionParser
    parser = OptionParser()
    ##
    ## Two isoform Psi
    ##
    parser.add_option("--compute-two-iso-psi", dest="two_iso_psi_files", nargs=2, default=None,
		       help="Compute Psi using MISO for a given set of two-isoform events. "
                       "Expects two arguments: the first is the set of events (in JSON/Pickle format), "
                       "the second is an output directory where estimated Psi values will "
		       "be outputted.")

    ##
    ## Multiple isoform Psi
    ##
    # parser.add_option("--compute-multi-iso-psi", dest="multi_iso_psi_files", nargs=3, default=None,
    #                   help="Compute Psi using for a given multi-isoform gene.  Expects three arguments: "
    #                   "the first is a file with the isoform lengths. The second is a file with the reads " 
    #                   "aligned to the isoform. The third is an output directory.")
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi", nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene.  Expects four arguments: "
                      "the first is a gene ID or set of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard deviation "
                      "for the fragment length distribution (assumed to have discretized "
                      "normal form.)")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare", nargs=3, default=None,
		      help="Compute comparison statistics between the two given samples. "
                      "Expects three directories: the first is sample1's MISO output, "
                      "the second is sample2's MISO output, and the third is the directory where "
		      "results of the sample comparison will be outputted.")
    parser.add_option("--run-two-iso-event", dest="run_two_iso_event", nargs=3, default=None,
		      help="Run MISO on two isoform event, given an event name, an events file "
                      "(in JSON/Pickle format) and an output directory.")
    parser.add_option("--summarize-samples", dest="summarize_samples", nargs=2, default=None,
		      help="Compute summary statistics of the given set of samples. "
                      "Expects a directory with MISO output and a directory to output "
                      "summary file to.")
    parser.add_option("--summarize-multi-iso-samples", dest="summarize_samples", nargs=2, default=None,
		      help="Compute summary statistics of the given set of samples from multi-isoform runs. "
                      "Expects a directory with MISO output and a directory to output summary file to.")
    parser.add_option("--pool-comparisons", dest="pool_comparisons", nargs=2, default=None,
		      help="Pool comparisons files into a single file. Expects a comparisons directory "
                      "generated by MISO and an output directory, and event type provided with --event-type.")
    parser.add_option("--use-cluster", action="store_true", dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs", default=False, type="int",
		      help="Size (in number of events) of each job to chunk events file into. "
                      "Only applies when running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_path, "settings", "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int", default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int", default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
		      help="Event type of two-isoform events (e.g. 'SE', 'RI', 'A3SS', ...)")

    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene", nargs=1, default=None,
                      help="View the contents of a gene/event that has been indexed. "\
                      "Takes as input an indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    ##
    ## Load the settings file 
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    if options.pool_comparisons != None:
        if options.event_type == None:
            print "Error: Must provide --event-type to pool comparisons"
            sys.exit(1)

        comparison_dir = os.path.abspath(os.path.expanduser(options.pool_comparisons[0]))
        output_dir = os.path.abspath(os.path.expanduser(options.pool_comparisons[1]))
        pool_comparisons(comparison_dir, options.event_type, output_dir)
    
    if options.samples_to_compare:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            os.makedirs(output_dirname)
        ht.output_samples_comparison(sample1_dirname, sample2_dirname,
                                     output_dirname)

    if options.run_two_iso_event:
        if options.read_len == None or options.overhang_len == None:
            print "Error: must provide --read-len and --overhang-len to run."
            sys.exit(1)

        if options.use_cluster:
            print "Use cluster option not supported for running on a single event."
            sys.exit(1)

        # convert paths to absolute path names
        event_name = options.run_two_iso_event[0]
        events_filename = os.path.abspath(options.run_two_iso_event[1])
        psi_outdir = os.path.abspath(os.path.expanduser(options.run_two_iso_event[2])) + '/'

        miso_events = as_events.MISOEvents(2, options.event_type,
                                           from_file=events_filename)

        run_two_iso_event(event_name, options.event_type, miso_events, psi_outdir,
                          options.read_len, options.overhang_len)

    # if options.inspect_events:
    #     print "Loading events from: %s" %(options.inspect_events)
    #     miso_events = as_events.MISOEvents(2, options.event_type, from_file=options.inspect_events)
    #     print "  - Total of %d events." %(len(miso_events.events))
	
    if options.two_iso_psi_files:
        if options.read_len == None or options.overhang_len == None:
            print "Error: must provide --read-len and --overhang-len to run."
            sys.exit(1)

        # convert paths to absolute path names
        events_filename = os.path.abspath(options.two_iso_psi_files[0])
        psi_outdir = os.path.abspath(options.two_iso_psi_files[1]) + '/'
        if options.use_cluster:
            run_two_iso_on_cluster(miso_path, events_filename, options.event_type, psi_outdir,
                                   options.read_len, options.overhang_len,
                                   chunk_jobs=options.chunk_jobs)
        else:
            if options.chunk_jobs:
                print "Error: Chunking jobs only applies when using the --use-cluster option " \
                      "to run MISO on cluster."
                sys.exit(1)

            compute_two_iso_psi(events_filename, options.event_type, psi_outdir,
                                options.read_len, options.overhang_len)

    ##
    ## Multiple isoforms interface based on SAM files
    ##
    if options.compute_gene_psi != None:
        if options.read_len == None:
            print "Error: must provide --read-len."
            sys.exit(1)

        paired_end = None

        if options.paired_end != None:
            paired_end = float(options.paired_end[0]), \
                         float(options.paired_end[1])

        overhang_len = 1

        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Genes to run on from GFF
        gene_ids = options.compute_gene_psi[0].split(",")

        # GFF filename describing genes
        gff_filename = os.path.abspath(os.path.expanduser(options.compute_gene_psi[1]))

        # BAM filename with reads
        bam_filename = os.path.abspath(os.path.expanduser(options.compute_gene_psi[2]))

        # Output directory
        output_dir = os.path.abspath(os.path.expanduser(options.compute_gene_psi[3]))

        compute_gene_psi(gene_ids, gff_filename, bam_filename, output_dir,
                         options.read_len, overhang_len, paired_end=paired_end,
                         event_type=options.event_type)


    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        samples_label = os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                                          'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename)

    if options.view_gene != None:
        indexed_gene_filename = os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            return

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #22
#!/usr/bin/env python
import os, sys
import GFF

input = sys.argv[1]
output = input[:input.rfind('.')] + '.collapsed.gff'

f = open(output, 'w')
reader = GFF.gmapGFFReader(input)
for r in reader: GFF.write_collapseGFF_format(f, r)
f.close()
Example #23
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, "COGENT.DONE")):
        return
    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, "in.trimmed.fa.cogent2.gff")
    reader = GFF.gmapGFFReader(filename)
    for r in reader:
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, "in.fa")), "fasta"):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, "cogent2.fa." + genome1 + ".gff")
    )
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, "cogent2.fa." + genome2 + ".gff")
    )

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write(
        "{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
            len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)
        )
    )
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write(
        "{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
            len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)
        )
    )

    in_aligned_to_genome1 = os.path.join(dirname, "in.trimmed.fa." + genome1 + ".gff")
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.iteritems():
        contigs = [x.chr for x in v]
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))

        if not seqid in d3:
            f2.write("NA\t0\tNA\tNA\n")
        else:
            scaffolds = [x.chr for x in d3[seqid]]
            cov = sum(x.coverage for x in d3[seqid])
            acc = sum(x.identity * x.coverage for x in d3[seqid]) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds), len(scaffolds), cov, acc))
Example #24
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: no such GFF file as %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)
    
    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)


    # Load the genes from the GFF
    # print "Loading genes from indexed GFF..."
    # t1 = time.time()
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    # t2 = time.time()
    # print "  - Loading took: %.2f seconds" %(t2 - t1)
        
    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # If given a template for the SAM file, use it
        template = None
        
        if settings and "sam_template" in settings:
            template = settings["sam_template"]
        
        # Load the BAM file
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, read_len,
                                                overhang_len,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if none of the reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print "Only %d reads in gene, skipping (needed >= %d reads)" \
                  %(num_raw_reads, min_event_reads)
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(num_isoforms,
                                                                mean_frag_len,
                                                                frag_variance,
                                                                read_len,
                                                                overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)
            
        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename, burn_in=burn_in,
                            lag=lag)
Example #25
    #test Fasta class (loading, editing, writing)
    test_fasta = Fasta("/home/brant/py_util/unit_test_data/seq.fasta")

    print "%s\n%s" % (test_fasta.filename, test_fasta.seq_len())
    for k in test_fasta.iterkeys():
        test_fasta[k] += "TGGCG"
    test_fasta.write_to_file("/home/brant/temp/temp.fa", 1)

    print "%s\n%s" % (test_fasta.filename, test_fasta.seq_len())

    other_test_fasta = Fasta("/home/brant/temp/temp.fa")
    print other_test_fasta.seq_len()
    #end Fasta test
    print other_test_fasta.order

    print "test substr_from_gff\n"
    import GFF
    seqfile = os.path.join(paths['py_testdata'], "eve.ceratitis_capitata.fa")
    gfffile = os.path.join(paths['py_testdata'],
                           "eve.ceratitis_capitata.fa.gff3")
    seq = Fasta(seqfile)
    gff = GFF.File(gfffile)
    evegene = seq.substr_from_gff([
        region for region in gff if 'gene_name' in region['attributes'].keys()
        and region['attributes']['gene_name'] == 'eve'
    ],
                                  name_key='gene_name',
                                  plus_strand=1)
    print evegene
Example #26
  help    = ".gff file")

parser.add_argument("--out",
  metavar = "STRING",
  type    = str,
  help    = "Output directory",
  default = ".")

args = parser.parse_args()

filBAM = args.bam
name   = os.path.splitext(os.path.basename(filBAM))[0]
filGFF = args.gff

# Open the gff file.
gffHandle = GFF.parse(open(filGFF))
# Open the bam file
bamHandle = pysam.AlignmentFile(filBAM, "rb")

### Functions for read directions ###
def Forward(read):
    return not read.is_reverse

def Reverse(read):
    return read.is_reverse

### CALCULATE COVERAGE AND ANNOTATION FOR EVERY LOCUS_TAG ###
coverages=list()
Example #27
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False, chunk_jobs=200,
                          overhang_len=1, paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)

    num_genes = len(gene_ids_to_gff_index.keys())

    miso_run = os.path.join(miso_path, "run_miso.py")

    print "Computing gene-level Psi for %d genes..." \
          %(num_genes)
    print "  - GFF index: %s" %(gff_dir)
    print "  - BAM: %s" %(bam_filename)
    print "  - Read length: %d" %(read_len)
    print "  - Output directory: %s" %(output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    
    # All commands to run
    all_miso_cmds = []

    for gene_id, gff_index_filename in gene_ids_to_gff_index.iteritems():
        miso_cmd = "python %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   %(miso_run, gene_id, gff_index_filename, bam_filename, output_dir,
                     read_len)
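        # An illustrative resulting command (all paths hypothetical):
        #   python run_miso.py --compute-gene-psi "geneA" "/idx/geneA.pickle" \
        #       reads.sorted.bam out/ --read-len 36 --overhang-len 1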
        
        if paired_end != None:
            # Run in paired-end mode
            frag_mean = float(paired_end[0])
            frag_sd = float(paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" %(frag_mean, frag_sd)
        else:
            miso_cmd += " --overhang-len %d" %(overhang_len)

        # Add settings filename if given
        if settings != None:
            miso_cmd += " --settings-filename %s" %(settings)

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print "  - Executing: %s" %(miso_cmd)
            os.system(miso_cmd)

    miso_settings = Settings.load(settings)

    if use_cluster:
        # Threshold for putting jobs in the long queue
        long_thresh = 50

        # Delay between jobs
        delay_constant = 0.9
        
        # Invoke the commands using the cluster
        print "Sending %d genes to be run on cluster in chunks of %d..." \
              %(num_genes, chunk_jobs)

        if not chunk_jobs:
            print "  - Using default chunk jobs = %d" %(200)
            chunk_jobs = 200

        chunk_jobs = max(1, int(round(num_genes / float(chunk_jobs))))

        # Split the gene records into batches
        cmd_batches = cluster_utils.chunk_list(all_miso_cmds, chunk_jobs)

        time_str = time.strftime("%m-%d-%y_%H:%M:%S")

        for batch_num, batch in enumerate(cmd_batches):
            batch_size = len(batch)
            print "Running batch %d (batch size = %d)" %(batch_num,
                                                         batch_size)

            if batch_size >= long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"
            
            # Pool all the MISO commands belonging to this batch
            batch_logs_dir = os.path.join(output_dir, "batch-logs")
            if not os.path.isdir(batch_logs_dir):
                os.makedirs(batch_logs_dir)
            batch_logfile = os.path.join(batch_logs_dir,
                                         "batch-%d-%s.log" %(batch_num,
                                                             time_str))
            redirected_output = " >> %s;\n" %(batch_logfile)
            cmd_to_run = redirected_output.join(batch)

            # Run on cluster
            job_name = "gene_psi_batch_%d" %(batch_num)
            cluster_utils.run_on_cluster(cmd_to_run, job_name, output_dir,
                                         queue_type=queue_type,
                                         settings=settings)
            time.sleep(delay_constant)
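A worked example of the batching arithmetic above, assuming cluster_utils.chunk_list(lst, n) splits lst into n batches:

# num_genes = 1000, --chunk-jobs 200 (genes per job):
#   chunk_jobs = max(1, int(round(1000 / 200.0))) = 5 batches
# chunk_list then yields 5 batches of ~200 MISO commands each; a batch of
# >= 50 commands (long_thresh) is sent to the "long" queue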
Example #29
def main():
	GTFgen = GFF.parse(GTFfile)
	GFFlist = makeGFFlist(GTFgen)
	ucscIDlist, transcriptdict = get_mRNA_sequence(GFFlist)
	write_utr_stopcodon_csvfile(ucscIDlist, transcriptdict)
Example #32
def main( argv = sys.argv ):

    parser = optparse.OptionParser( version = "%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf."  )

    parser.set_defaults( as_gtf = False,
                         id_format = "%08i",
                         test = None )
    
    (options, args) = E.Start( parser, add_pipe_options = True )

    as_gtf = options.as_gtf
    id_format = options.id_format

    if as_gtf:
        gff = GTF.Entry()
    else:
        gff = GFF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator( options.stdin ):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start 
        gff.end = bed.end
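        # bed.mFields presumably holds the BED columns after chrom/start/end,
        # i.e. mFields[0]=name, mFields[1]=score, mFields[2]=strand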
        if bed.mFields and len(bed.mFields) >= 3:
            gff.strand = bed.mFields[2]
        else: 
            gff.strand = "."

        if bed.mFields and len(bed.mFields) >= 2:
            gff.score = bed.mFields[1]
        
        
        if as_gtf:
            if bed.mFields:
                gff.gene_id = bed.mFields[0]
                gff.transcript_id = bed.mFields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id            
        else:
            if bed.mFields:
                gff.source = bed.mFields[0]
            
        options.stdout.write( str(gff) + "\n" )

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped) )

    E.Stop()
Example #33
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--GFFfile', help='coding GFF or GTF file')
    parser.add_argument('--bamfile1', help='bamfile from first genome mapping')
    parser.add_argument('--bamfile2', help='bamfile from polyA removal')
    parser.add_argument('--densitypathandfilestring',
                        help='density file output path')
    parser.add_argument('--wigpathandfile', help='wig file output path')
    parser.add_argument('--totreads',
                        default=-1,
                        help='total reads for normalization')
    parser.add_argument('--assignment', help='5 or 3 end', required=True)
    parser.add_argument('--riboshiftdict', help='dictionary of riboshifts')
    parser.add_argument('--bamfileoutput', help='output bam file')
    parser.add_argument('--softclipped', help='number of soft clipped allowed')
    args = parser.parse_args()

    import ast
    riboshiftdict = ast.literal_eval(args.riboshiftdict)
    GFFgen = GFF.parse(args.GFFfile)
    bamgen0 = pysam.AlignmentFile(args.bamfile1, "rb")
    bamfileout = pysam.AlignmentFile(args.bamfileoutput,
                                     "wb",
                                     template=bamgen0)
    rfpdense = densebuilder(GFFgen, args.bamfile1, args.bamfile2,
                            args.densitypathandfilestring, args.wigpathandfile,
                            args.totreads, args.assignment, riboshiftdict,
                            bamfileout, args.softclipped)
    rfpdense.setdense()
Example #34
    parser.add_argument('--threshold',
                        default=1,
                        help='thresholding read counts')
    parser.add_argument('--totreads',
                        default=-1,
                        help='reads for normalization')
    parser.add_argument('--outputdata', help='output data filepath')
    parser.add_argument('--bamfileoutput', help='output bam file')
    args = parser.parse_args()

    import ast
    riboshiftdict = ast.literal_eval(
        args.riboshiftdict)  #convert string into dictionary
    print riboshiftdict
    print "parsing gff..."
    GTFgen = GFF.parse(args.GTFfile)
    print "loading bam file..."
    bamfile = pysam.AlignmentFile(args.bamfileinput, "rb")
    print "loading genome..."
    genome = twobitreader.TwoBitFile(args.twobitfile)
    print "writing bam out file..."
    bamfileout = pysam.AlignmentFile(args.bamfileoutput,
                                     "wb",
                                     template=bamfile)

    rfpdense = densebuilder(bamfile, GTFgen, genome, riboshiftdict,
                            int(args.threshold), args.totreads,
                            args.outputdata, args.assignment, bamfileout)
    rfpdense.builddense()
Example #35
#gff sqlite action

import GFF, os, sqlite3


def InsertGFFRegion(curobj, vals):
    curobj.execute('INSERT INTO gff VALUES (null,?,?,?,?,?,?,?,?,?,?)', vals)


gff_filename = r"G:\AllBrantsStuff\python\ephinaroun\sqlite\dmel-all-r4.3.filtered.gff"
DB_filename = os.path.join(os.path.dirname(gff_filename),
                           '.' + os.path.basename(gff_filename) + '.DB')

gff = GFF.File(gff_filename)
connection = sqlite3.connect(DB_filename)
cursor = connection.cursor()

try:
    cursor.execute('drop table gff')
    connection.commit()
except:
    pass
cursor.execute('''CREATE TABLE gff (
				id INTEGER PRIMARY KEY AUTOINCREMENT,
				sequence_name TEXT NOT NULL,
				source TEXT NOT NULL,
				type TEXT NOT NULL,
				start INTEGER NOT NULL,
				end INTEGER NOT NULL,
				score REAL NOT NULL,
				strand TEXT NOT NULL,
Example #36
def main():
    GTFgen = GFF.parse(GTFfile)
    GFFlist = makeGFFlist(GTFgen)
    ucscIDlist, transcriptdict = build_utr3_stop_positions(GFFlist)
    write_utr_stopcodon_csvfile(ucscIDlist, transcriptdict)
Example #37
def main():
    GTFgen = GFF.parse(GTFfile)
    GFFlist = makeGFFlist(GTFgen)
    find_uORFs(GFFlist)