Exemplo n.º 1
0
def calculate_total_lengths(db_file, species):
    """
    Run the CDS, UTR and intron total-length calculations for one annotation.

    Opens ``db_file`` with ``gffutils.FeatureDB``, builds the CDS, exon and
    transcript lookups through the ``af`` helpers and passes them to the
    three ``calculate_total_*`` routines. According to the original notes the
    resulting base counts are printed and can be used to derive ratios for
    region-distribution pie charts.

    :param db_file: value handed to ``gffutils.FeatureDB`` (presumably the
        path of an existing database file -- confirm against callers)
    :param species: string
        passed to ``af.get_keys`` to obtain the keys corresponding to the
        GTF column nomenclature.
    :return: None
    """
    feature_keys = af.get_keys(species)
    annotation_db = gffutils.FeatureDB(db_file)

    cds_lookup = af.get_all_cds_dict(annotation_db, feature_keys['cds'])
    exon_lookup = af.get_all_exons_dict(
        annotation_db,
        exon_key=feature_keys['exon'],
        transcript_id_key=feature_keys['transcript_id'],
    )
    transcript_lookup = af.get_all_transcripts_dict(
        annotation_db,
        transcript_key=feature_keys['transcript'],
        transcript_id_key=feature_keys['transcript_id'],
    )

    calculate_total_cds_length(annotation_db, feature_keys)
    calculate_total_utr_lengths(annotation_db, cds_lookup, feature_keys)
    calculate_total_intron_lengths(annotation_db, exon_lookup,
                                   transcript_lookup, feature_keys)
def getCDScoords(gff):
    """
    Return the CDS coordinates of the longest coding transcript of each gene.

    A gffutils database called ``<basename of gff>.db`` is created in the
    current working directory when it is not already there. It is removed
    again before the function returns.

    :param gff: path of the GFF annotation file.
    :return: dict mapping ``<gene id>_<chrom>_<strand>`` to a list of
        ``[start, end]`` pairs (ordered by start) taken from the CDS features
        of the mRNA with the greatest summed CDS length. Genes without any
        mRNA child are left out.
    """
    allCDScoords = {
    }  # {ENSGENE_chrm_strand : [[cdsexon1start, cdsexon1stop], [cdsexon2start, cdsexon2stop]]}
    genecount = 0
    noCDSexonscount = 0

    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.basename(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge')

    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    for gene in db.features_of_type('gene'):
        genecount += 1

        # Query the mRNA children once; the list serves both the emptiness
        # check and the per-transcript loop below.
        transcripts = list(
            db.children(gene, featuretype='mRNA', order_by='start'))

        # If this gene has no mRNA children (for example a ncRNA), skip it
        if not transcripts:
            noCDSexonscount += 1
            continue

        CDSlengths = {}  # {transcriptID : combined_length_of_coding_exons}
        CDScoords = {
        }  # {transcriptID : [[cdsexon1start, cdsexon1stop], [cdsexon2start, cdsexon2stop]]}
        genename = str(gene.id)
        chrm = str(gene.chrom)
        strand = gene.strand
        for transcript in transcripts:
            transcriptID = str(transcript.id)
            coords = [[codingexon.start, codingexon.end]
                      for codingexon in db.children(transcript,
                                                    featuretype='CDS',
                                                    order_by='start')]
            CDScoords[transcriptID] = coords
            # GFF coordinates are 1-based and inclusive, so a segment spans
            # end - start + 1 bases. Dropping the "+ 1" under-counts each
            # segment by one base and penalises transcripts with many CDS
            # segments when the longest one is picked.
            CDSlengths[transcriptID] = sum(stop - start + 1
                                           for start, stop in coords)

        # Ties resolve to the first transcript in dict iteration order.
        longestcds = max(CDSlengths, key=CDSlengths.get)
        allCDScoords[genename + '_' + chrm + '_' +
                     strand] = CDScoords[longestcds]

    os.remove(db_fn)

    print('Looked through {0} genes. {1} of them had no coding exons. Found longest CDS sequences for {2} of them.'.format(
        genecount, noCDSexonscount, len(allCDScoords)))
    return allCDScoords
Exemplo n.º 3
0
def openGTF(gtfPath, verbose=True):
    """
    Open the gffutils database stored next to ``gtfPath``, building it first
    when opening raises ``ValueError``.

    :param gtfPath: path of the GTF file; the database lives at
        ``<gtfPath>.db``.
    :param verbose: when true, "Indexing..." is reported through ``eprint``
        before the database is built.
    :return: ``gffutils.FeatureDB`` opened with ``keep_order=True``.
    """
    db_path = "{}.db".format(gtfPath)
    try:
        return gffutils.FeatureDB(db_path, keep_order=True)
    except ValueError:
        if verbose:
            eprint("Indexing...")
        # Gene/transcript inference is switched off for the build.
        gffutils.create_db(gtfPath,
                           dbfn=db_path,
                           force=True,
                           keep_order=True,
                           disable_infer_genes=True,
                           disable_infer_transcripts=True,
                           merge_strategy='merge',
                           sort_attribute_values=True)
        # Re-open so the returned handle carries keep_order=True.
        return gffutils.FeatureDB(db_path, keep_order=True)
Exemplo n.º 4
0
def open_gtf(gtf_path):
    """
    Open (or build) the gffutils database belonging to a GTF file.

    The GTF is scanned for ``gene`` and ``transcript`` records so that, if
    the database has to be built, gffutils only infers the feature types the
    file does not already contain.

    :param gtf_path: path of the GTF file; the database lives at
        ``<gtf_path>.db``.
    :return: ``gffutils.FeatureDB`` -- the existing database opened with
        ``keep_order=True``, or the freshly created one when opening raised
        ``ValueError``.
    """
    with_genes = False
    with_transcripts = False
    # The context manager closes the handle even if the scan stops early.
    with open(gtf_path) as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                # Blank or truncated line: there is no feature-type column.
                continue
            feat_type = fields[2]
            if feat_type == "gene":
                with_genes = True
            elif feat_type == "transcript":
                with_transcripts = True
            # The first feature is not always a gene, so the scan cannot stop
            # at the first record. Once both types were seen nothing further
            # can change the outcome.
            if with_genes and with_transcripts:
                break

    db_path = "{}.db".format(gtf_path)
    try:
        gtf = gffutils.FeatureDB(db_path, keep_order=True)
    except ValueError:
        gtf = gffutils.create_db(gtf_path,
                                 dbfn=db_path,
                                 force=True,
                                 keep_order=True,
                                 disable_infer_genes=with_genes,
                                 disable_infer_transcripts=with_transcripts,
                                 merge_strategy="merge",
                                 sort_attribute_values=True)
    return gtf
Exemplo n.º 5
0
    def __init__(self, data, dbfn=None, id_column='id', csv2rec_kwargs=None):
        """
        Set up a generic container for a table of data.

        :param data: If a string, it is treated as a filename and loaded with
            `csv2rec` using `csv2rec_kwargs`. Anything else is stored as
            given (expected to be a record array).
        :param dbfn: Filename for a `gffutils.FeatureDB`. Optional; when it
            is falsy no database is opened and `self.gffdb` is None.
        :param id_column: Which column contains gene accessions that can be
            looked up in the `gffutils.FeatureDB`.
        :param csv2rec_kwargs: Kwargs passed to `matplotlib.mlab.csv2rec`.
            Default is dict(delimiter="\\t", missing="NA").
        """
        if isinstance(data, basestring):
            # The defaults only matter when a file actually has to be read.
            if csv2rec_kwargs is None:
                csv2rec_kwargs = dict(delimiter='\t', missing='NA')
            data = csv2rec(data, **csv2rec_kwargs)

        self.data = data
        self.id_column = id_column
        self.dbfn = dbfn
        self.gffdb = gffutils.FeatureDB(dbfn) if self.dbfn else None
        self._cached_lookup = None
def extract_gene_gff(all_row_gene_list_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir):
    """
    Write the gene sequences listed in ``all_row_gene_list_file`` to a FASTA
    file inside ``all_row_gene_fasta_dir``.

    Every file in ``strain_db_dir_path`` is opened as a gffutils database and
    published as the module global ``<stem>_db``. The 70-15 reference FASTA
    index is published as ``MGG_db``. These globals are kept because code
    outside this function may rely on them. Nothing is written when the
    output file already exists.

    :param all_row_gene_list_file: path object of a text file with one
        ``<strain>_<gene id>`` entry per line.
    :param strain_db_dir_path: directory (path object) holding the per-strain
        gffutils databases.
    :param contig_path: directory (path object) holding ``<strain>.fasta``
        contig files.
    :param all_row_gene_fasta_dir: directory (path object) receiving
        ``<list stem>_all_row_gene_fasta.fasta``.
    :return: None
    """
    global_names = globals()
    for db_file in strain_db_dir_path.iterdir():
        global_names[db_file.stem.strip() + "_db"] = gffutils.FeatureDB(db_file)
    global_names["MGG_db"] = SeqIO.index("../../70-15_refference_genome/magnaporthe_oryzae_70-15_8_genes.fasta", "fasta")
    gene_file_path = all_row_gene_fasta_dir / (all_row_gene_list_file.stem + "_all_row_gene_fasta.fasta")
    if gene_file_path.exists():
        return
    with gene_file_path.open("w") as out_fl, all_row_gene_list_file.open() as id_lines:
        for raw_line in id_lines:
            # Lines read from the file keep their newline; left in place it
            # becomes part of the database key and of the record id.
            protein_id = raw_line.strip()
            if not protein_id:
                continue
            id_parts = protein_id.split("_", 1)
            strain_id = id_parts[0]
            gff_protein_id = id_parts[1]
            if strain_id == "MGG":
                # Reference ids are looked up by their first nine characters.
                mgg_id = protein_id[0:9]
                MGG_SeqRecord = global_names.get("MGG_db")[mgg_id]
                MGG_SeqRecord_amend = SeqRecord(
                    MGG_SeqRecord.seq,
                    id=mgg_id,
                    description=""
                    )
                SeqIO.write(MGG_SeqRecord_amend, out_fl, "fasta")
            else:
                if strain_id == "WD-3-1":
                    # For this prefix the strain name is the first eight
                    # characters of the entry; the gene id starts at index 9.
                    strain_id = protein_id[0:8]
                    gff_protein_id = protein_id[9:]
                gene_plain_sequences = global_names.get(strain_id + "_db")[gff_protein_id].sequence(
                    str(contig_path / (strain_id + ".fasta")),
                    use_strand=True
                    )
                record = SeqRecord(
                    Seq(gene_plain_sequences, Bio.Alphabet.IUPAC.unambiguous_dna),
                    id=protein_id,
                    description=""
                    )
                SeqIO.write(record, out_fl, "fasta")
Exemplo n.º 7
0
def getjunjieregions(junjiegff):
    """
    Collect the positions covered by exons of 'region' features.

    A gffutils database called ``<basename of junjiegff>.db`` is created in
    the current working directory when missing and removed before returning.

    :param junjiegff: path of the GFF file.
    :return: dict ``{chrm : {strand : [positions]}}`` where the positions are
        the de-duplicated coordinates ``start..end`` (inclusive) of every
        exon that is a child of a 'region' feature.
    """
    # Make gff databases
    print('Indexing gff...')
    db_fn = os.path.basename(junjiegff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(junjiegff, db_fn, verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    junjieregions = {}  # {chrm : {strand : [list of nt in a junjie region]}}

    for region in db.features_of_type('region'):
        for exon in db.children(region, featuretype='exon'):
            by_strand = junjieregions.setdefault(exon.chrom, {})
            positions = by_strand.setdefault(exon.strand, [])
            positions.extend(range(exon.start, exon.end + 1))

    # Remove duplicates
    for by_strand in junjieregions.values():
        for strand in by_strand:
            by_strand[strand] = list(set(by_strand[strand]))

    os.remove(db_fn)
    return junjieregions
Exemplo n.º 8
0
def get_gtf_db(gtf, in_memory=False):
    """
    Return a gffutils database for ``gtf``.

    An existing ``<gtf>.db`` file is always preferred. Otherwise the database
    is created, either on disk at ``<gtf>.db`` or, with ``in_memory``, in
    ``":memory:"``.
    """
    on_disk = gtf + ".db"
    if file_exists(on_disk):
        return gffutils.FeatureDB(on_disk)

    target = ":memory:" if in_memory else on_disk
    if in_memory or not file_exists(target):
        db = gffutils.create_db(gtf, dbfn=target,
                                infer_gene_extent=guess_infer_extent(gtf))
    # The in-memory handle is handed back directly; an on-disk database is
    # re-opened from its file.
    return db if in_memory else gffutils.FeatureDB(target)
Exemplo n.º 9
0
def main():
    """
    Write every transcript, its exons and its CDS segments to the output file.

    Command-line arguments are parsed with ``parse_args`` into a CDS file, a
    GTF file and an output path. The GTF is indexed into ``tmpGFF.db``. For
    each transcript (ordered by start) the transcript record and its exons
    are printed to the output. When the transcript id is present in the CDS
    table, the CDS exons returned by ``Get_cds`` are printed as well.
    """
    cds_file, gtf_file, out_file = parse_args(sys.argv[1:])
    cdsinfo = make_cdsList(cds_file)
    # The context manager closes the output even when indexing or a lookup
    # raises part-way through.
    with open(out_file, "w+") as outfn:
        # Make GFFdb, store db in tmpGFF.db file
        make_GFFdb(gtf_file)
        db = gffutils.FeatureDB('tmpGFF.db', keep_order=True)
        for mRNA in db.features_of_type("transcript", order_by="start"):
            # print transcript_id
            print("Processing transcript_id: ", mRNA["transcript_id"][0])
            # print transcript
            print(mRNA, file=outfn)

            # print original exons
            for exon in db.children(mRNA, featuretype="exon",
                                    order_by="start"):
                print(exon, file=outfn)

            transcript = mRNA.id
            exons = get_exonList(db, mRNA)
            # Some transcripts don't have a CDS, so check first
            if transcript not in cdsinfo:
                continue
            cds_start = cdsinfo[transcript][0]
            cds_end = cdsinfo[transcript][1]
            # The UTR lists are returned as well but deliberately not
            # printed.
            fiveUTR, threeUTR, cds_exons = Get_cds(exons, cds_start, cds_end,
                                                   mRNA.strand)
            for cds_exon in cds_exons:
                print(cds_exon, file=outfn)
Exemplo n.º 10
0
def getnoncodingtxs(gff, genesofint):
    """
    Find the genes of interest that have no coding transcript and return the
    longest transcript of each.

    A gffutils database is created at ``<absolute path of gff>.db`` when it
    does not exist yet.

    :param gff: path of the GFF annotation file.
    :param genesofint: sized collection of gene ids (no ``gene:`` prefix, no
        version suffix).
    :return: dict ``{gene id : value returned by getlongesttranscript}`` for
        every listed gene that ``seeifcoding`` reports as non-coding.
    """
    geneandtx = {}  # {ENSMUSG : ENSMUST}
    genecount = 0
    noncodinggenecount = 0
    # Constant-time membership test instead of scanning the collection once
    # per gene in the annotation.
    wanted_genes = set(genesofint)

    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)

    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    for gene in db.features_of_type('gene'):
        # Strip the "gene:" prefix and everything from the first dot on.
        geneid = str(gene.id).replace('gene:', '').split('.')[0]
        if geneid not in wanted_genes:
            continue
        genecount += 1
        if not seeifcoding(gene, db):
            noncodinggenecount += 1
            geneandtx[geneid] = getlongesttranscript(gene, db)

    print('Looked for {0} genes in gff and found {1} of them. Of these {2} had no coding transcripts.'.format(
        len(genesofint), genecount, noncodinggenecount))

    return geneandtx
Exemplo n.º 11
0
def main():
    """
    Write a BED line for every listed gene, extended upstream and downstream.

    Reads the module-level ``args``: ``gffFile`` (gffutils database),
    ``geneList`` (comma-delimited gene ids), ``outputFile``, ``upstreamSize``
    and ``downstreamSize``. Each output line holds chrom, start minus the
    upstream size, end plus the downstream size, and the gene id, separated
    by tabs.
    """
    # Get database file
    db = gffutils.FeatureDB(args.gffFile)
    genes = db.features_of_type('gene', order_by='start')

    genelist_tpose = np.transpose(
        np.genfromtxt(args.geneList, delimiter=',', dtype=None))

    # Open output file
    # NOTE(review): the file is opened in binary mode but receives str
    # values -- fine on Python 2, TypeError on Python 3. Confirm the target
    # interpreter.
    with open(args.outputFile, 'wb') as outputFile:
        for gene in genes:
            # A gene id listed several times yields several lines.
            for wanted_id in genelist_tpose:
                if gene.id == wanted_id:
                    # Upstream and downstream positions indicated by command
                    # line arguments
                    totalstart = gene.start - args.upstreamSize
                    totalstop = gene.end + args.downstreamSize
                    bedArray = [gene.chrom, totalstart, totalstop, gene.id]
                    outputFile.write("\t".join(map(str, bedArray)) + "\n")
Exemplo n.º 12
0
def classifygenes(exoniccoords, gff):
    """
    Label every gene in ``exoniccoords`` according to ``isitALE`` and
    ``isitTUTR``.

    A gffutils database is created at ``<absolute path of gff>.db`` when it
    does not exist yet.

    :param exoniccoords: dict
        ``{geneid : {txid : [positionfactor, [set of exonic coords]]}}``
    :param gff: path of the GFF annotation file.
    :return: dict ``{geneid : label}`` with label 'ALE' (ALE only), 'TUTR'
        (TUTR only), 'mixed' (neither) or 'ERROR' (both).
    """
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)

    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    genetypes = {}  # {geneid : type of 3' UTR end}
    for gene, geneexoniccoords in exoniccoords.items():
        ale = isitALE(geneexoniccoords, db)
        tutr = isitTUTR(geneexoniccoords, db)
        # Explicit comparisons against True/False are kept: a helper result
        # equal to neither leaves `genetype` untouched.
        if ale == True and tutr == False:
            genetype = 'ALE'
        elif ale == False and tutr == True:
            genetype = 'TUTR'
        elif ale == False and tutr == False:
            genetype = 'mixed'
        elif ale == True and tutr == True:
            genetype = 'ERROR'

        genetypes[gene] = genetype

    return genetypes
Exemplo n.º 13
0
def main(args):
    """
    List the gene/transcript ids whose transcript type is to be filtered out.

    :param args: namespace providing ``gtf_db`` (gffutils database path),
        ``transtypes`` (collection of transcript types to filter) and
        ``out`` (output path).

    For every transcript whose ``transcript_type`` attribute intersects
    ``args.transtypes``, the id of its gene is recorded when the gene has
    exactly one transcript, otherwise the id of the transcript itself. The
    collected ids are written to ``args.out`` sorted, one per line.
    """
    logging.info("GTF:{gtf_db}, Transtypes:{transtypes}, outfile:{out}".format(
        gtf_db=args.gtf_db, transtypes=args.transtypes, out=args.out))

    filtered_ids = set()

    # GTF db to be imported for filtering
    gtf_db = gffutils.FeatureDB(args.gtf_db)

    # Lets go through all the genes in the gtf
    for gene in gtf_db.features_of_type('gene'):
        # Fetch the transcripts once per gene: the list drives the loop and
        # answers the "single transcript gene" question without re-querying
        # the database for every matching transcript.
        transcripts = list(gtf_db.children(gene, featuretype='transcript'))
        single_transcript_gene = len(transcripts) == 1
        for transcript in transcripts:
            # If it has any of the filtered attributes then process it
            if set(transcript.attributes['transcript_type']).intersection(
                    args.transtypes):
                # A single-transcript gene is removed as a whole, otherwise
                # only the transcript is removed.
                filtered_ids.add(
                    gene.id if single_transcript_gene else transcript.id)

    # Write the filtered genes/transcripts to an outfile
    with open(args.out, "w") as out_handle:
        for filt_id in sorted(filtered_ids):
            out_handle.write("{}\n".format(filt_id))
Exemplo n.º 14
0
def sanitize_gff_file(gff_fname,
                      in_memory=True,
                      in_place=False):
    """
    Sanitize a GFF file and write the gene records of the result.

    ``gff_fname`` is loaded directly when ``is_gff_db`` recognises it as a
    database. Otherwise a database is built, in memory or through
    ``get_gff_db``. Output goes to ``gff_fname`` itself when ``in_place`` is
    set, else to standard output.
    """
    if is_gff_db(gff_fname):
        # It's a database filename, so load it
        db = gffutils.FeatureDB(gff_fname)
    elif in_memory:
        # Need to create a database for file
        db = gffutils.create_db(gff_fname, ":memory:",
                                verbose=False)
    else:
        db = get_gff_db(gff_fname)

    if in_place:
        gff_out = gffwriter.GFFWriter(gff_fname,
                                      in_place=in_place)
    else:
        gff_out = gffwriter.GFFWriter(sys.stdout)

    sanitized_db = sanitize_gff_db(db)
    for gene_rec in sanitized_db.all_features(featuretype="gene"):
        gff_out.write_gene_recs(sanitized_db, gene_rec.id)
    gff_out.close()
Exemplo n.º 15
0
def gff2bed12(gff, bedoutfile, nameoutfile):
    """
    Convert every transcript of a GFF file to a BED12 line and record which
    gene it belongs to.

    A gffutils database is created at ``<absolute path of gff>.db`` when it
    does not exist yet.

    :param gff: path of the GFF annotation file.
    :param bedoutfile: path receiving the BED12 lines; the name column is cut
        at its first dot.
    :param nameoutfile: path receiving ``<transcript id>\\t<gene id>`` lines,
        both ids cut at their first dot.
    """
    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)

    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    with open(bedoutfile, 'w') as bedoutfh, open(nameoutfile,
                                                 'w') as nameoutfh:
        for transcriptcounter, transcript in enumerate(
                db.features_of_type('transcript'), 1):
            if transcriptcounter % 1000 == 0:
                print('Converting transcript {0}...'.format(transcriptcounter))
            bedfields = db.bed12(transcript).split('\t')
            bedfields[3] = bedfields[3].split('.')[0]
            bedoutfh.write('\t'.join(bedfields) + '\n')
            txid = transcript.id.split('.')[0]
            # `geneid` ends up as the last parent gene seen; a transcript
            # without a gene parent reuses the value of the previous
            # transcript.
            for gene in db.parents(transcript, featuretype='gene'):
                geneid = gene.id.split('.')[0]
            nameoutfh.write(txid + '\t' + geneid + '\n')
Exemplo n.º 16
0
def get_sequence(gff_folder: str, name: str, gene_id: str):
    """
    Download the sequence of one gene from NCBI and return it strand-corrected.

    The gene is looked up in the gffutils database ``<gff_folder>/<name>.db``.
    Its start/end coordinates select the slice of nuccore record ``name``
    fetched with ``wget`` into a temporary FASTA file, which is always
    removed afterwards.

    :param gff_folder: directory holding the gffutils databases.
    :param name: database base name, also used as the nuccore record id.
    :param gene_id: id of the feature to fetch.
    :return: the sequence of the last record in the downloaded FASTA,
        reverse-complemented for features on the '-' strand; ``None`` when
        the download fails or the file holds no record.
    """
    db_file = os.path.join(gff_folder, "{}.db".format(name))
    item = gffutils.FeatureDB(db_file)[gene_id]
    strand = item.strand
    url = ('https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=fasta&id={}&from={}&to={}'
           .format(name, item.start, item.end))

    # Only the name is needed: wget writes the file, so the handle is closed
    # straight away and the file is removed by hand in `finally`.
    fasta_temp = tempfile.NamedTemporaryFile('w', suffix=".fa", delete=False)
    fasta_temp.close()

    sequence = None
    try:
        with Popen(['wget', '-O', fasta_temp.name, url]) as proc:
            proc.communicate()
            exit_code = proc.wait()
        if exit_code != 0:
            print("ERROR failed to download fasta for {}".format(name))
            return sequence
        # Plain "r": the "U" flag did nothing on Python 3 and raises
        # ValueError since Python 3.11.
        with open(fasta_temp.name, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                sequence = record.seq
                if strand == '-':
                    sequence = sequence.reverse_complement()
    finally:
        try:
            os.remove(fasta_temp.name)
        except OSError:
            # Best-effort cleanup: the temporary file may already be gone.
            pass
    return sequence
Exemplo n.º 17
0
def annotate_combined_count_file(count_file, gtf_file, out_file=None):
    """
    Add a ``symbol`` column with gene names to a combined count table.

    :param count_file: tab-separated count table whose first column holds
        the gene ids.
    :param gtf_file: GTF file; its gffutils database is expected at
        ``<gtf_file>.db``.
    :param out_file: output path; defaults to ``annotated_combined.counts``
        in the directory of ``count_file``.
    :return: the output path, or ``None`` when the database file is missing,
        gffutils is unavailable, or an exon lacks ``gene_id``/``gene_name``.
    """
    dbfn = gtf_file + ".db"
    if not file_exists(dbfn):
        return None

    if not gffutils:
        return None

    db = gffutils.FeatureDB(dbfn, keep_order=True)

    if not out_file:
        out_file = os.path.join(os.path.dirname(count_file),
                                "annotated_combined.counts")

    # if the genes don't have a gene_id or gene_name set, bail out
    symbol_lookup = {}
    try:
        for exon in db.features_of_type('exon'):
            symbol_lookup[exon['gene_id'][0]] = exon['gene_name'][0]
    except KeyError:
        return None

    df = pd.io.parsers.read_table(count_file, sep="\t", index_col=0, header=0)

    # Rows whose id has no known symbol get an empty string.
    df['symbol'] = df.apply(lambda row: symbol_lookup.get(row.name, ""),
                            axis=1)
    df.to_csv(out_file, sep="\t", index_label="id")
    return out_file
Exemplo n.º 18
0
def main(database, nmd_file):
    """
    Return the events from ``nmd_file`` after processing them with
    ``include_exon_nmd``.

    :param database: gffutils database opened with ``keep_order=True``.
    :param nmd_file: input handed to ``possible_nmd``.
    :return: the value returned by ``include_exon_nmd``.
    """
    dbhuman = gffutils.FeatureDB(database, keep_order=True)
    candidates = possible_nmd(nmd_file)
    # if negative correlation: include_exon_nmd
    # (if positive correlation, exclude_exon_nmd would be used instead)
    return include_exon_nmd(candidates, dbhuman)
Exemplo n.º 19
0
 def test__get_all_transcripts_overlapping_exon(self, database,
                                                single_exon_id, nmd_exons,
                                                all_transcripts_of_exon):
     """The transcripts found for the single exon match the expected set."""
     exon = gffutils.FeatureDB(database)[single_exon_id]
     observed = nmd_exons._get_all_transcripts_overlapping_exon(exon)
     assert observed == all_transcripts_of_exon
Exemplo n.º 20
0
def main():
    """
    Derive the regions between genes and save them as BED and GTF.

    Genes from ``snakemake.input.db`` are extended by 100 bp on both sides
    (dm6 genome), sorted and merged. The complement of the merged intervals
    is passed through ``interName`` and saved to ``snakemake.output.bed``,
    and that result is passed through ``interGFF`` and saved to
    ``snakemake.output.gtf``.
    """
    db = gffutils.FeatureDB(snakemake.input.db)
    genes = to_bedtool(db.features_of_type("gene")).saveas()
    merged = genes.slop(b=100, genome="dm6").sort().merge()
    intergenic = merged.complement(genome="dm6").saveas()
    named = intergenic.each(interName).saveas(snakemake.output.bed)
    named.each(interGFF).saveas(snakemake.output.gtf)
Exemplo n.º 21
0
def prep(
    bam_file="/projects/bio/rrna/data/rsubread_aligned_v2/SRR891244_s.bam",
    gtf_db_file="/projects/bio/rrna/data/annotations/Homo_sapiens.GRCh38.84.features_name.db"
):
    """
    Open the alignment file and the annotation database.

    Typical use: ``bam, gtf_db = cc.prep()``

    :param bam_file: BAM file opened with pysam in ``"rb"`` mode.
    :param gtf_db_file: gffutils database file.
    :return: tuple ``(pysam.AlignmentFile, gffutils.FeatureDB)``.
    """
    return pysam.AlignmentFile(bam_file, "rb"), gffutils.FeatureDB(gtf_db_file)
Exemplo n.º 22
0
def annotation_bed12(annotation_db):
    """
    Write every transcript of a gffutils database as a BED12 line.

    The output name is the stripped database path with its last two
    dot-separated components removed, plus ``.bed12``.

    :param annotation_db: path of the gffutils database.
    :return: path of the BED12 file written.
    """
    import gffutils
    db = gffutils.FeatureDB(annotation_db)
    stem_parts = annotation_db.strip().split('.')[:-2]
    bed12 = '.'.join(stem_parts) + '.bed12'
    with open(bed12, 'w') as handle:
        handle.writelines(
            db.bed12(transcript, name_field='transcript_id') + '\n'
            for transcript in db.features_of_type('transcript'))
    return bed12
Exemplo n.º 23
0
def filter_by_attribute(attr_in, dbfn):
    """
    Print every feature whose ``gene_biotype`` attribute is exactly
    ``[attr_in]``.

    :param attr_in: the biotype to keep.
    :param dbfn: gffutils database to read.
    """
    db = gffutils.FeatureDB(dbfn)

    for feat in db.all_features():
        # Attribute values are compared as a whole list, so a feature with
        # several biotypes never matches.
        if feat.attributes['gene_biotype'] == [attr_in]:
            print(feat)
Exemplo n.º 24
0
def main(dbfile, fafile, outprefix):
    """
    Run the gene and mRNA statistics.

    Before computing the statistics, the gff3 file has to be converted into
    a database file with gff2sqliteDB.py.
    """
    annotation_db = gffutils.FeatureDB(dbfile, keep_order=True)
    genome_seqs = Fasta(fafile)
    for stat in (gene_stat, mRNA_stat):
        stat(annotation_db, genome_seqs, outprefix)
Exemplo n.º 25
0
def convertGffToBedGffUtils(gffFile):
    """
    Print the features of an existing gffutils database as a BedTool.

    The database is read from ``gffFile`` with its last four characters
    replaced by ``.db``. It is not created here.
    """
    # Result is not used below; the call is kept as in the original.
    fn = gffutils.example_filename(gffFile)
    db_path = gffFile[:-4] + '.db'
    db = gffutils.FeatureDB(db_path, keep_order=True)
    bedVersion = pybedtools_integration.to_bedtool(
        db.all_features(order_by='start'))
    print(bedVersion)
Exemplo n.º 26
0
def translatePHB47():
    """
    Build a lookup from mRNA name to gene name for the PHB47 annotation.

    :return: ``defaultdict(str)`` mapping the first ``Name`` value of every
        mRNA to the first ``Name`` value of its gene; unknown keys yield "".
    """
    import gffutils
    db = gffutils.FeatureDB('/home/maize/shared/databases/genomes/Zea_mays/PHB47/Zmaysvar.PHB47v1.1.gene_exons.gff3.db', keep_order=True)
    lookup = defaultdict(str)
    for gene in db.features_of_type('gene', order_by='start'):
        transcripts = db.children(gene, featuretype='mRNA', order_by='start')
        lookup.update(
            (mrna['Name'][0], gene['Name'][0]) for mrna in transcripts)
    return lookup
Exemplo n.º 27
0
def db2gtf(gtf, db):
    """
    Dump every feature of a gffutils database to a text file.

    :param gtf: path of the file to write, one feature per line.
    :param db: gffutils database to read; features are ordered by seqid,
        then start.
    """
    logger.info(
        "Converting gene annotation file to .gtf format (takes a while)...")
    with open(gtf, "w") as f:
        records = gffutils.FeatureDB(db).all_features(
            order_by=('seqid', 'start'))
        f.writelines(str(record) + '\n' for record in records)
    logger.info("Gene database written to " + gtf)
Exemplo n.º 28
0
class TestGeneInfo:
    """Tests for ``GeneInfo`` built on the toy database ``toy_data/synth.db``."""

    # Directory containing this test module; the toy data lives below it.
    source_dir = os.path.dirname(os.path.realpath(__file__))
    # Opened once at class-definition time and shared by all tests.
    gffutils_db = gffutils.FeatureDB(os.path.join(source_dir,
                                                  'toy_data/synth.db'),
                                     keep_order=True)
    # The single gene record every test builds its GeneInfo from.
    gene_db = gffutils_db['ENSMUSG00000020196.10']

    def test_basic(self):
        """Check region, transcript coordinates, counts, strand and gene map."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)
        assert gene_info.get_gene_region() == ("chr10", 1000, 10000)
        assert gene_info.transcript_start("ENSMUST00000001712.7") == 1000
        assert gene_info.transcript_end("ENSMUST00000001713.7") == 10000
        assert gene_info.transcript_region("ENSMUST00000001715.7") == (8000,
                                                                       8800)
        assert gene_info.transcript_exon_count("ENSMUST00000001713.7") == 6
        assert gene_info.total_transcript_length(
            "ENSMUST00000001712.7") == 1105
        assert gene_info.chr_id == "chr10"
        assert gene_info.isoform_strands["ENSMUST00000001713.7"] == '-'
        assert gene_info.gene_id_map[
            "ENSMUST00000001713.7"] == 'ENSMUSG00000020196.10'

    def test_intron_profiles(self):
        """Check the intron feature list and per-transcript intron profiles."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)
        assert gene_info.intron_profiles.features[0] == (1101, 1999)
        assert gene_info.intron_profiles.features[-1] == (8201, 8499)
        assert len(gene_info.intron_profiles.features) == 9
        assert gene_info.intron_profiles.profiles["ENSMUST00000001712.7"] == [
            1, 1, -1, 1, -1, -1, 1, -1, -1
        ]
        assert gene_info.intron_profiles.profiles["ENSMUST00000001714.7"] == [
            -2, -2, -2, -2, -2, -2, -1, -1, -2
        ]
        assert gene_info.intron_profiles.profiles["ENSMUST00000001715.7"] == [
            -2, -2, -2, -2, -2, -2, -1, -1, 1
        ]

    def test_exon_profiles(self):
        """Check the exon and split-exon feature lists and profiles."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)
        assert gene_info.exon_profiles.features[0] == (1000, 1100)
        assert gene_info.exon_profiles.features[-1] == (9500, 10000)
        assert len(gene_info.exon_profiles.features) == 11
        assert gene_info.exon_profiles.profiles["ENSMUST00000001712.7"] == [
            1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1
        ]
        assert gene_info.exon_profiles.profiles["ENSMUST00000001714.7"] == [
            -2, -2, -2, -2, -2, -2, -2, 1, -2, -2, -2
        ]

        assert gene_info.split_exon_profiles.features[2] == (2101, 2200)
        assert gene_info.split_exon_profiles.features[-1] == (9500, 10000)
        assert len(gene_info.split_exon_profiles.features) == 11
        assert gene_info.split_exon_profiles.profiles[
            "ENSMUST00000001712.7"] == [1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1]
        assert gene_info.split_exon_profiles.profiles[
            "ENSMUST00000001714.7"] == [
                -2, -2, -2, -2, -2, -2, -2, 1, -2, -2, -2
            ]
Exemplo n.º 29
0
def main():
    """
    Write a BED12 line for every pair of non-overlapping exons of each gene.

    Reads the module-level ``args``: ``databaseFile`` (gffutils database)
    and ``outputFile``. Genes with fewer than two exons are skipped. Exon
    starts are shifted to 0-based coordinates before the pairs are formed,
    and ``defineRegion`` supplies the four block boundaries of every pair.
    """
    # Get the database
    db = gffutils.FeatureDB(args.databaseFile)
    genes = db.features_of_type('gene', order_by='start')

    # Open the output BED file
    with open(args.outputFile, 'w') as outputFile:
        for gene in genes:
            # Get all exons
            exonList = list(
                db.children(gene, featuretype='exon', order_by='start'))

            # Skip genes that only have 1 exon, or genes that don't have an
            # exon e.g. miRNA
            if len(exonList) < 2:
                continue

            # Convert to BED 0-based coordinate system. Only have to change
            # start.
            for exon in exonList:
                exon.start -= 1

            # Keep every pairwise combination whose exons do not overlap
            junctionArray = [
                pair for pair in itertools.combinations(exonList, 2)
                if not testOverlap(*pair)
            ]

            # Create BED file
            for idx, (firstExon, secondExon) in enumerate(junctionArray):
                # Only keep up to 100bp on each side of the junction
                (exon1Start, exon1Stop, exon2Start,
                 exon2Stop) = defineRegion(idx, junctionArray)

                # Construct various parts of the junction BED file
                totalStart = exon1Start  # Take exon1 start
                totalStop = exon2Stop  # Take exon2 stop
                name = (''.join(firstExon.attributes['Name']) + '|' +
                        ''.join(secondExon.attributes['Name']))
                lengths = '{0},{1}'.format(
                    len(range(exon1Start, exon1Stop)),
                    len(range(exon2Start, exon2Stop)))
                starts = '{0},{1}'.format(exon1Start - totalStart,
                                          exon2Start - totalStart)
                bedArray = [
                    firstExon.chrom, totalStart, totalStop, name,
                    firstExon.score, firstExon.strand, totalStart, totalStop,
                    "255,0,0", "2", lengths, starts
                ]

                # Output to BED file
                outputFile.write("\t".join(map(str, bedArray)) + "\n")
Exemplo n.º 30
0
def retrieve_splicing_gff(gffFile=None):
    """
    Collect the exons and junctions of every skipped-exon (SE) event.

    The gffutils database ``<gffFile>.db`` is opened, or created first when
    opening fails. Events on chromosomes whose name contains "_" are
    skipped.

    :param gffFile: path of the splicing-event GFF file.
    :return: ``defaultdict(dict)`` keyed by the event's ``ID`` attribute.
        Each entry holds 'chromosome', 'strand', 'BODY', 'UP', 'DOWN',
        'start', 'end', 'inclusionJxns' and 'exclusionJxns'.
    :raises Exception: when an event's source column is not "SE".
    """
    info = defaultdict(dict)
    gffDbFile = gffFile + ".db"

    try:
        gffDatabase = gffutils.FeatureDB(gffDbFile)
    except Exception:
        # Any failure to open is treated as "database not built yet".
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        gffutils.create_db(gffFile, gffDbFile)
        gffDatabase = gffutils.FeatureDB(gffDbFile)

    for geneObj in gffDatabase.features_of_type('gene'):
        # NOTE(review): recent gffutils versions return attribute values as
        # lists, which cannot be used as dict keys -- confirm the version
        # this code targets.
        event = geneObj.attributes['ID']

        if "_" in geneObj.chrom:
            continue

        info[event]['chromosome'] = geneObj.chrom
        info[event]['strand'] = geneObj.strand
        spliceType = geneObj.source
        if spliceType != "SE":
            raise Exception(
                "Unsupported splice type {!r} for event {!r}; only 'SE' is "
                "handled".format(spliceType, event))

        # Exactly two isoforms are expected: inclusion first, then exclusion.
        inclusionIso, exclusionIso = list(
            gffDatabase.children(geneObj, featuretype='mRNA'))
        inclusionExons = list(gffDatabase.children(inclusionIso))
        exclusionExons = list(gffDatabase.children(exclusionIso))

        inclusionJxns = get_junctions(inclusionExons)
        exclusionJxns = get_junctions(exclusionExons)

        # The middle child of the inclusion isoform is the event body; which
        # flanking child counts as upstream depends on the strand.
        info[event]["BODY"] = inclusionExons[1]
        if geneObj.strand == "+":
            info[event]["UP"] = inclusionExons[0]
            info[event]["DOWN"] = inclusionExons[2]
        else:
            info[event]["UP"] = inclusionExons[2]
            info[event]["DOWN"] = inclusionExons[0]
        info[event]['start'] = inclusionIso.start
        info[event]['end'] = inclusionIso.stop
        info[event]['inclusionJxns'] = inclusionJxns
        info[event]['exclusionJxns'] = exclusionJxns
    return info