# NOTE(review): this line is a whitespace-collapsed script fragment. Its
# original line breaks and indentation are lost and it is cut off inside the
# final `headerRow.extend([` call, so the code itself is left untouched.
# Visible steps: copy the "-i" / "-o" option values from `opts` into
# infile / outfile; set promotorUp and promotorDown to 2000; wrap infile and
# outfile in tab-delimited csv reader / writer objects; construct the hg18
# CpG-island ExtendedBed, a Genome() and the hg18 / ncbi36.1 Ensembl gene
# data; start building the output header row.
# NOTE(review): both open() handles are handed straight to csv and are never
# closed in the visible code.
# takes in a csv file of intervals and tells us some stuff about them for o, a in opts: if o == "-i": infile = a elif o == "-o": outfile = a promotorUp = 2000 promotorDown = 2000 intervals = csv.reader(open(infile, "r"), delimiter="\t") writer = csv.writer(open(outfile, "w"), delimiter="\t") cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed")) genome = Genome() # load gene data genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1") headerRow = [ 'Ensembl', 'Name', 'chr', 'start', 'stop', 'strand', 'No. Transcripts', 'Avg. Exons per Transcript', "Unique Exons per Gene", "Start positions", "Start positions / No. Transcripts" ] #"Promotor G-Count", "Promotor C-Count", "Promotor A-Count", "Promotor T-Count" , headerRow.extend([
# NOTE(review): whitespace-collapsed script fragment, cut off inside the
# hg19 sINEs `os.path.expanduser(` call; the code is left untouched.
# Visible steps: when exprfile is set, fccol and exprcols must be set too,
# otherwise exprcols becomes []; outputfile is required; gene data and the
# genome are constructed for `assembly`; the CpG-island, LINE and SINE
# ExtendedBed files are chosen per assembly ("hg18" or "hg19").
# NOTE(review): the visible part has no branch for any other assembly value,
# and the checks are asserts, which `python -O` strips.
# if we have an expression file we need fc and expression columns if exprfile != None: assert fccol != None assert exprcols != None else: exprcols = [] assert outputfile != None genedata = EnsemblGenes(assembly=assembly) genome = Genome(genomeBuild=assembly) if assembly == "hg18": cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands.bed")) lINEs = ExtendedBed( os.path.expanduser("~/mount/publicdata/hg18/repeats/LINEs-0.bed")) sINEs = ExtendedBed( os.path.expanduser("~/mount/publicdata/hg18/repeats/SINEs-0.bed")) elif assembly == "hg19": cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg19/CpGIslands/cpgislands.bed")) lINEs = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg19/Repeats/UCSC_HG19_LINEs.bed"), defaultkeys=["chrom", "chromStart", "chromEnd", "name", "strand"]) sINEs = ExtendedBed( os.path.expanduser(
# NOTE(review): the original line breaks / indentation of this fragment were
# lost; it is laid out here as straight-line top-level statements in the
# original order.

# Lookup structures built from the already-loaded Ensembl gene data.
genespluspromotor = Ensembl.ReverseGeneMapping(
    genedata,
    tssPadding=UPSTREAM_PROMOTOR_DIST,
)
genepromotors = Ensembl.ReversePromotorMapping(
    genedata,
    upstreamPadding=UPSTREAM_PROMOTOR_DIST,
    downstreamPadding=DOWNSTREAM_PROMOTOR_DIST,
)
exons = Ensembl.ReverseExonMapping(genedata)
transcriptionSites = Ensembl.TranscriptionSites(genedata)

# CpG islands for the selected assembly.
# Source: UCSC table browser - Expression & Regulation - CpG Islands
# (download all columns with exception of "bin").
# The path has no leading "~", so expanduser returns it unchanged.
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "/mnt/50tb/publicdata/" + assembly + "/CpGIslands/cpgislands.bed"
    )
)

# Chromosome bands for the selected assembly.
# Source: UCSC table browser - Mapping and Sequencing - Chromosome Bands
# (download all columns with exception of "gieStain").
gBanding = ExtendedBed(
    os.path.expanduser(
        "/mnt/50tb/publicdata/" + assembly + "/G-Banding/cytogenetic.map.bed"
    ),
    defaultkeys=["chrm", "start", "stop", "band"],
    forcekeys=True,
)

chromosomeEnds = ChromosomeEnds(assembly)

### ### ###
from bed.treatment import ExtendedBed
from sam.SamFormat import SAMFile
import os
import sys

# Script fragment: walks the alignments of the SAM file named by sys.argv[1]
# and, each time the alignment key changes, prints the in-LAD / not-in-LAD
# tallies collected for the previous key.
# NOTE(review): the original line breaks were lost and the loop body is cut
# off after the counter reset; only the visible statements are reproduced.

lads = ExtendedBed(
    os.path.expanduser("~/mount/privatedata/non-Adams/donahue.greg/LADs.bed"))
alignments = SAMFile(os.path.expanduser(sys.argv[1]))

# Running tallies for the key currently being processed.
numbInLAD = 0
numbNotInLAD = 0
currentSeq = None


def previousKey(key, isin, isnotin):
    """Print one summary line for a finished key.

    The line holds, space separated: key, isin, isnotin and the fraction
    isin / (isin + isnotin).

    Raises ZeroDivisionError when both counts are zero.
    """
    # A single pre-formatted string prints the same text as the Python 2
    # statement `print key, isin, isnotin, ratio` and is also valid Python 3.
    print("%s %s %s %s" % (key, isin, isnotin, isin / float(isin + isnotin)))


for samEntry in alignments:
    # Entries whose chromosome field is "*" are ignored entirely
    # (presumably unmapped reads -- TODO confirm against SAMFile).
    if samEntry.chrm == "*":
        continue

    if currentSeq is not None and samEntry.key != currentSeq:
        # print the stats on the previous key, then start a fresh tally
        previousKey(currentSeq, numbInLAD, numbNotInLAD)
        numbInLAD = 0
        numbNotInLAD = 0
# NOTE(review): whitespace-collapsed script fragment. The leading
# `downstreamPromotor = int(a)` looks like the tail of an option-parsing
# branch that starts outside this view, so its nesting cannot be recovered
# and the code is left untouched.
# Visible steps: require methdatafile, affyfile, affyfccol, affyexprcol and
# outputfile to be set; construct the hg18 gene data, genome, NetAffx
# annotation (cdfname "HG-U133_Plus_2") and CpG-island ExtendedBed; wrap
# affyfile in an IndexedCSV; then, for every key `affy` of affyCSV whose
# "Ensembl" annotation has exactly one value, append a signed natural-log
# fold change to affyEnsemblLogFCs[ensembls[0]] (log(fc) for fc > 0,
# otherwise -log(|fc|)).
# NOTE(review): a fold change of exactly 0.0 takes the else branch, where
# math.log(abs(0.0)) raises ValueError. The checks are asserts, which
# `python -O` strips.
downstreamPromotor = int(a) assert methdatafile != None assert affyfile != None assert affyfccol != None assert affyexprcol != None assert outputfile != None genedata = EnsemblGenes(assembly="hg18") genome = Genome(genomeBuild="hg18") affyannotation = NetAffxAnnotation(genome="hg18", cdfname="HG-U133_Plus_2") cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed")) affyCSV = IndexedCSV(affyfile) affyEnsemblLogFCs = collections.defaultdict(list) affyEnsemblExprs = collections.defaultdict(list) affyEnsemblPvalues = collections.defaultdict(list) for affy in affyCSV: ensembls = affyannotation.getValues(affy, "Ensembl") if len(ensembls) == 1: affyFC = float(affyCSV[affy][affyfccol]) affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log( abs(affyFC)) * -1.0 affyEnsemblLogFCs[ensembls[0]].append(affylogFC)
# NOTE(review): the original line breaks / indentation of this fragment were
# lost; it is laid out here as top-level statements followed by isUpstream.

# Threshold and its human-readable label (presumably base pairs, given the
# "kb" suffix -- TODO confirm).
Small_TSS_TTS_Distance = 1000
# Floor division keeps the label an integer ("1kb") under both Python 2 and
# Python 3; a plain "/" would render "1.0kb" under true division.
Small_TTS_TTS_Distance_Human = str(Small_TSS_TTS_Distance // 1000) + "kb"

# load data
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")
genes = Ensembl.ReverseGeneMapping(genedata)
exons = Ensembl.ReverseExonMapping(genedata)
transcriptionSites = Ensembl.TranscriptionSites(genedata)
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))
affyannotation = NetAffxAnnotation()
# Same gene mapping again, this time padded by TSS_TTS_Distance (defined
# outside this fragment).
paddedGenes = Ensembl.ReverseGeneMapping(genedata,
                                         tssPadding=TSS_TTS_Distance)


def isUpstream(distance, strand):
    """Classify a signed distance as upstream ('Y') or not ('N').

    Args:
        distance: signed number; zero counts as upstream on either strand.
        strand: "+" or "-".

    Returns:
        'Y' when distance >= 0 on the "+" strand or distance <= 0 on the
        "-" strand, otherwise 'N'.

    Raises:
        SystemExit: with status -1 when strand is neither "+" nor "-".
    """
    if strand == "+":
        return 'Y' if distance >= 0 else 'N'
    if strand == "-":
        return 'Y' if distance <= 0 else 'N'

    # Unknown strand: keep the original "abort the run" behaviour, but say
    # why, and use sys.exit rather than the interactive-only `exit` helper
    # that the site module injects.
    import sys
    sys.stderr.write("isUpstream: unexpected strand %r\n" % (strand,))
    sys.exit(-1)
# NOTE(review): whitespace-collapsed script fragment, cut off inside
# isUpstream (only the "+" strand branch is visible); the code is left
# untouched.
# Visible steps: construct the hg18 Genome, ChromosomeEnds and Ensembl gene
# data (annotation "ncbi36.1"); build ReverseGeneMapping, ReverseExonMapping
# and TranscriptionSites from that gene data; construct the CpG-island
# ExtendedBed and a NetAffxAnnotation; build a second ReverseGeneMapping
# padded by TSS_TTS_Distance (defined outside this fragment); create two
# defaultdict(list) gene -> probe maps; begin isUpstream(distance, strand),
# which asserts strand is '+' or '-' and returns `distance >= 0` for '+'.
# load data genome = Genome(genomeBuild="hg18") chromosomeEnds = ChromosomeEnds("hg18") genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1") genes = Ensembl.ReverseGeneMapping(genedata) exons = Ensembl.ReverseExonMapping(genedata) transcriptionSites = Ensembl.TranscriptionSites(genedata) cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed")) affyannotation = NetAffxAnnotation() paddedGenes = Ensembl.ReverseGeneMapping(genedata, tssPadding=TSS_TTS_Distance) # store full mapping here for the end geneToMethProbeMapping = collections.defaultdict(list) geneToAffyProbeMapping = collections.defaultdict(list) def isUpstream(distance, strand): assert strand in ['+', '-'] if strand == "+": return distance >= 0