# Excerpt of a script's top-level flow. Whitespace has been collapsed and the
# snippet is cut at both ends: it opens inside an option-handling branch and
# stops inside the headerRow.extend([...]) call.
# Visible steps: take infile/outfile from option values, set promotorUp and
# promotorDown to 2000, open infile as a tab-delimited csv reader and outfile
# as a tab-delimited csv writer, load a CpG-island BED file through
# ExtendedBed, build Genome() with no arguments, load Ensembl genes
# (assembly "hg18", annotation "ncbi36.1") and start assembling the header row.
# NOTE(review): both open() handles are handed straight to csv and are not
# closed anywhere in the visible code -- confirm the full script closes them.
infile = a elif o == "-o": outfile = a promotorUp = 2000 promotorDown = 2000 intervals = csv.reader(open(infile, "r"), delimiter="\t") writer = csv.writer(open(outfile, "w"), delimiter="\t") cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed")) genome = Genome() # load gene data genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1") headerRow = [ 'Ensembl', 'Name', 'chr', 'start', 'stop', 'strand', 'No. Transcripts', 'Avg. Exons per Transcript', "Unique Exons per Gene", "Start positions", "Start positions / No. Transcripts" ] #"Promotor G-Count", "Promotor C-Count", "Promotor A-Count", "Promotor T-Count" , headerRow.extend([ "Promotor region start", "Promotor region end", "Promotor GC Percent",
# Excerpt of a script's top-level flow. Whitespace has been collapsed and the
# snippet is cut at both ends; it stops inside an
# ExtendedBed(os.path.expanduser( call.
# Visible steps: store the assembly option; assert that methdatafile and
# outputfile were supplied; when exprfile is set, also require fccol and
# exprcols, otherwise default exprcols to []. Then build EnsemblGenes and
# Genome for the chosen assembly and load annotation BED files from an
# assembly-specific directory (CpG islands, LINEs and SINEs for "hg18"; the
# "hg19" branch is cut off after the CpG islands).
# NOTE(review): option validation relies on assert, which is skipped under
# `python -O`, and compares with `!= None` rather than `is not None`.
# NOTE(review): no else branch for other assembly values is visible here --
# confirm against the full file.
assembly = a assert methdatafile != None # if we have an expression file we need fc and expression columns if exprfile != None: assert fccol != None assert exprcols != None else: exprcols = [] assert outputfile != None genedata = EnsemblGenes(assembly=assembly) genome = Genome(genomeBuild=assembly) if assembly == "hg18": cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands.bed")) lINEs = ExtendedBed( os.path.expanduser("~/mount/publicdata/hg18/repeats/LINEs-0.bed")) sINEs = ExtendedBed( os.path.expanduser("~/mount/publicdata/hg18/repeats/SINEs-0.bed")) elif assembly == "hg19": cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg19/CpGIslands/cpgislands.bed")) lINEs = ExtendedBed( os.path.expanduser(
# Excerpt of a script's top-level flow (Python 2: uses the print statement).
# Whitespace has been collapsed; the snippet stops at the header of the
# `for testid in exprCSV:` loop, and `matrices` is assigned somewhere outside
# this excerpt.
# Visible steps: set defaults (exprfile None, ensemblidcol "ensemblid",
# upstreamPromotor 5000, downstreamPromotor 1000); read
# --gene-expression-file from opts and assert it was given; print
# len(matrices); load EnsemblGenes and Genome for "hg18"; index the expression
# file with IndexedCSV(keyPos=1); collect one motif name per entry of
# matrices; open exprfile + ".transcriptionfactors" as a tab-delimited csv
# writer and write a header made of a copy of exprCSV.keys followed by the
# motif names.
# NOTE(review): the output file handle is not closed in the visible code.
exprfile = None ensemblidcol = "ensemblid" upstreamPromotor = 5000 downstreamPromotor = 1000 for o,a in opts: if o=="--gene-expression-file": exprfile = a assert exprfile != None print len(matrices) # WARNING: everything we just read in are pfm (position frequency), we might need position weight matrices # We have choice of JASPER (downloaded), JASPER (with motility) or TFD or combining them genedata = EnsemblGenes(assembly="hg18") genome = Genome(genomeBuild = "hg18") exprCSV = IndexedCSV(exprfile,keyPos=1) motifnames = [] for matrix in matrices: motifnames.append(matrix) output = csv.writer(open(exprfile+".transcriptionfactors","w"),delimiter='\t') # header header = exprCSV.keys[:] # copy keys header.extend(motifnames) output.writerow(header) for testid in exprCSV:
# Excerpt of a script's top-level flow (Python 2: uses `print >> file`).
# Whitespace has been collapsed; the snippet starts at an option-handling
# `if` whose enclosing loop is not visible.
# Visible steps: take the output folder from -o / --outputFolder; assert that
# infile and outputfolder were supplied; define two helpers that parse one
# tab-separated line:
#   getBlatLocation(line) -> (int(field 0), field 13, int(field 15),
#                             int(field 16), field 8)
#   getBlatQStarts(line)  -> ints parsed from the comma-separated field 19,
#                            dropping whatever follows the final comma
# then build Genome(genomeBuild=build), call makeDirectory(outputfolder) and
# write the opening HTML/CSS of outputfolder + "/index.html".
# NOTE(review): the field indexes presumably follow BLAT PSL output (matches,
# tName, tStart, tEnd, strand, qStarts) -- confirm against the PSL format.
if o == "-o" or o == "--outputFolder": outputfolder = a assert infile != None assert outputfolder != None def getBlatLocation(line): result = line.split("\t") return int(result[0]), result[13], int(result[15]), int( result[16]), result[8] def getBlatQStarts(line): result = line.split("\t") return [int(y) for y in result[19].split(",")[:-1]] genome = Genome(genomeBuild=build) makeDirectory(outputfolder) with open(outputfolder + "/index.html", "w") as indexFile: print >> indexFile, """ <html><body> <head> <style type="text/css"> td{font-size:small;} th{font-size:small;} </style> </head> """
def __init__(self, build):
    """Create the per-build helpers used by this object.

    build is forwarded to Genome (as genomeBuild) and to ChromosomeEnds;
    valuesBehaviour is initialised to missingValuesDontCount.
    """
    genome_for_build = Genome(genomeBuild=build)
    self.genome = genome_for_build
    self.valuesBehaviour = missingValuesDontCount
    ends_for_build = ChromosomeEnds(build)
    self.chromosomeEnds = ends_for_build
# Excerpt of a script's top-level flow. Whitespace has been collapsed and the
# snippet is cut at both ends: it opens in an `else:` of the option handling
# and stops inside the Ensembl.ReversePromotorMapping( call.
# Visible steps: -c loads control Affy expression data via IndexedCSV; -e
# loads RNA-seq expression data via IndexedCSV keyed on "test_id"; -a sets the
# assembly. Both promoter distances are set to 2000, outfile is opened as a
# tab-delimited csv writer, Genome(assembly) is built, and the Ensembl gene
# data plus its reverse gene / gene-plus-promoter / promoter mappings are
# created.
# NOTE(review): DOWNSTREAM_PROMOTOR_DIST is not used in the visible part; it
# may be passed in the truncated ReversePromotorMapping call -- confirm.
# NOTE(review): the output file handle is not closed in the visible code.
else: regions.append(BedIntervalTree(a)) elif o == "-c": controlAffyExpressionData = IndexedCSV(a) #Annotated difference file input elif o == "-e": rnaSeqExpressionData = IndexedCSV(a, key="test_id") elif o == "-a": assembly = a UPSTREAM_PROMOTOR_DIST = 2000 DOWNSTREAM_PROMOTOR_DIST = 2000 writer = csv.writer(open(outfile, "w"), delimiter="\t") genome = Genome(assembly) ### # load data genedata = Ensembl.EnsemblGenes(assembly=assembly) genes = Ensembl.ReverseGeneMapping(genedata) genespluspromotor = Ensembl.ReverseGeneMapping( genedata, tssPadding=UPSTREAM_PROMOTOR_DIST) genepromotors = Ensembl.ReversePromotorMapping( genedata, upstreamPadding=UPSTREAM_PROMOTOR_DIST,
# Excerpt of a script's top-level flow. Whitespace has been collapsed and the
# snippet is cut at both ends: it opens inside an option-handling branch and
# stops at `if len(ensembls) == 1:`.
# Visible steps: --outputfile sets outputfile; --promotorsize sets both
# upstreamPromotor and downstreamPromotor to the same int; methdatafile,
# affyfile, affyfccol, affyexprcol and outputfile are asserted to be set.
# Then EnsemblGenes and Genome are built for "hg18", a NetAffxAnnotation is
# created for cdfname "HG-U133_Plus_2", a CpG-island BED file is loaded, the
# Affy file is indexed with IndexedCSV, and three defaultdict(list)
# accumulators (log fold changes, expressions, p-values) are prepared before
# looping over the Affy entries and looking up each one's "Ensembl" values.
# NOTE(review): option validation relies on assert, which is skipped under
# `python -O`, and compares with `!= None` rather than `is not None`.
affypcol = a elif o == "--outputfile": outputfile = a elif o == "--promotorsize": upstreamPromotor = int(a) downstreamPromotor = int(a) assert methdatafile != None assert affyfile != None assert affyfccol != None assert affyexprcol != None assert outputfile != None genedata = EnsemblGenes(assembly="hg18") genome = Genome(genomeBuild="hg18") affyannotation = NetAffxAnnotation(genome="hg18", cdfname="HG-U133_Plus_2") cpgIslands = ExtendedBed( os.path.expanduser( "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed")) affyCSV = IndexedCSV(affyfile) affyEnsemblLogFCs = collections.defaultdict(list) affyEnsemblExprs = collections.defaultdict(list) affyEnsemblPvalues = collections.defaultdict(list) for affy in affyCSV: ensembls = affyannotation.getValues(affy, "Ensembl") if len(ensembls) == 1:
# Excerpt of a script's top-level flow (Python 2: uses `print >> file`).
# Whitespace has been collapsed; the snippet stops inside the triple-quoted
# HTML string that is being written to index.html.
# Visible steps: assert that infile and outputfolder were supplied; define
# two helpers that parse one tab-separated line:
#   getBlatLocation(line) -> (int(field 0), field 13, int(field 15),
#                             int(field 16), field 8)
#   getBlatQStarts(line)  -> ints parsed from the comma-separated field 19,
#                            dropping whatever follows the final comma
# then initialise exonboundaries (False) and blockStarts ([]), build
# Genome(genomeBuild=build), call makeDirectory(outputfolder), copy three
# arrow GIFs plus sortable.css and sortable.js into the output folder, and
# start writing outputfolder + "/index.html".
# NOTE(review): the shutil.copy sources are relative paths, so they resolve
# against the current working directory at run time.
# NOTE(review): the field indexes presumably follow BLAT PSL output (matches,
# tName, tStart, tEnd, strand, qStarts) -- confirm against the PSL format.
assert infile != None assert outputfolder != None def getBlatLocation(line): result = line.split("\t") return int(result[0]), result[13], int(result[15]), int( result[16]), result[8] def getBlatQStarts(line): result = line.split("\t") return [int(y) for y in result[19].split(",")[:-1]] exonboundaries = False blockStarts = [] genome = Genome(genomeBuild=build) makeDirectory(outputfolder) shutil.copy("arrow-down.gif", outputfolder + "/arrow-down.gif") shutil.copy("arrow-none.gif", outputfolder + "/arrow-none.gif") shutil.copy("arrow-up.gif", outputfolder + "/arrow-up.gif") shutil.copy("sortable.css", outputfolder + "/sortable.css") shutil.copy("sortable.js", outputfolder + "/sortable.js") with open(outputfolder + "/index.html", "w") as indexFile: print >> indexFile, """ <html><body> <head> <style type="text/css">
###
def distanceHumanReadable(dist):
    """Return dist divided by 1000 as a string with a "kb" suffix."""
    return "%skb" % (dist / 1000,)


# Fixed distances and window settings used by the rest of the script.
TSS_TTS_Distance = 1000
SURROUNDING_SEQUENCE_Distance = 250  # each side
WINDOW_SIZE = 500
WINDOW_OFFSET = 5

# load data
genome = Genome(genomeBuild="hg18")
chromosomeEnds = ChromosomeEnds("hg18")

# Ensembl gene annotation and the lookups derived from it.
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")
genes = Ensembl.ReverseGeneMapping(genedata)
exons = Ensembl.ReverseExonMapping(genedata)
transcriptionSites = Ensembl.TranscriptionSites(genedata)

# CpG islands come from a BED file under the user's home directory.
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"
    )
)