Exemplo n.º 1
0
def updateExcludedRegions(path, tfName, window):
    """Mark motifs of one TF that overlap ENCODE excluded regions.

    Every ``*.bed.gz`` file found directly under ``path`` is parsed with
    ``countBed.getBed6Anno`` and ``countBed.sortInterval``.  Each document in
    the module-level ``mcollection`` whose ``tf_name`` equals ``tfName`` is
    then passed to ``countBed.getMotifAnno`` and the returned region list is
    stored in the document's ``motif_mapability_info.exclude`` field.

    Args:
        path: directory searched (non-recursively) for ``*.bed.gz`` files.
        tfName: value matched against the ``tf_name`` field of the motifs.
        window: forwarded unchanged to ``countBed.getMotifAnno``.

    Returns:
        0, always.
    """
    for infile in glob.glob(os.path.join(path, "*.bed.gz")):
        # The experiment name is the file name up to its first dot.
        expName = os.path.basename(infile).split('.')[0]
        with gzip.open(infile, 'rt') as bedFile:
            bed = csv.reader(bedFile, delimiter='\t')
            annoIntvlDict = countBed.getBed6Anno(bed, expName)
            intervalDict = countBed.sortInterval(annoIntvlDict)
            for test in mcollection.find({"tf_name": tfName}):
                region = test["motif_genomic_regions_info"]
                motifChrom = region["chr"]
                motifStart = region["start"]
                motifEnd = region["end"]
                regionList, valueList = countBed.getMotifAnno(
                    annoIntvlDict, intervalDict,
                    motifChrom, motifStart, motifEnd, window)
                if regionList:
                    # Single pre-formatted argument: prints the same text
                    # under both the Python 2 statement and Python 3 function.
                    print("%s %s %s %s %s" % (regionList, valueList,
                                              motifChrom, motifStart, motifEnd))
                # Set only the "exclude" leaf via dot notation.  Setting the
                # whole "motif_mapability_info" sub-document would replace it
                # and drop its sibling fields ("score", "gc_content").
                # NOTE(review): every BED file overwrites the value written for
                # the previous one (including with an empty list), so only the
                # last file processed is reflected -- confirm this is intended.
                mcollection.update(
                    {"_id": test["_id"]},
                    {"$set": {"motif_mapability_info.exclude": regionList}},
                    upsert=True)

    return 0
Exemplo n.º 2
0
def _intOrNone(text):
    """Return ``int(text)``, or None when ``text`` is not a valid integer."""
    try:
        return int(text)
    except ValueError:
        return None


def _buildMotifInstance(row):
    """Build the motif-instance document template for one TF-information row.

    ``row`` is one tab-separated record of the motif/TF information file; it
    must have at least 20 columns (an IndexError is raised otherwise).  Fields
    that are filled in by later update steps are initialised to None or to an
    empty container.
    """
    return {
        "motif_id": row[3],
        "tf_name": row[6],
        "motif_score": None,
        "motif_tf_info": {
            "species_name": row[7],
            "tf_status": row[8],  # direct or indirect
            "family_name": row[9],
            "dbds": row[10],
            "dbd_count": _intOrNone(row[11]),
            "dbid": row[12],
            "motif_type": row[14],
            "msource_id": row[15],
            "msource_type": row[16],
            "msource_author": row[17],
            "msource_year": _intOrNone(row[18]),
            "pmid": row[19],  # citation
        },
        "motif_genomic_regions_info": {
            "chr": None,
            "start": None,
            "end": None,
            "strand": None,
        },
        "motif_mapability_info": {
            "exclude": [],
            "score": [],
            "gc_content": None,
        },
        "motif_cons_info": {
            "phylop_euarchontoglires": None,
            "phylop_mammals": None,
            "phylop_vertebrate": None,
            "phastCons_euarchontoglires": None,
            "phastCons_mammals": None,
            "phastCons_vertebrate": None,
            # "SNP_diversity" / "Indel_diversity" are not stored yet.
        },
        "motif_gene_mapping_info": {
            "closest_gene": None,
            "feature": None,  # intergenic or 3' 5'
            "dist_tss": None,
            # genes whose range +/- 10kb contains the centre of the motif
            "genelist10kb": [],
            "transcriptidlist10kb": [],
        },
        "motif_ct_info": {
            "ct_name": None,
            "ct_type": None,  # normal/cancerous/cellline/primarycell
            "accessibility_score": {},  # {type: log likelihood score}
            "chip_score": {},  # tf: pval(overlapping peaks)
            "h3k4me3_score": None,
            "h3k4me1_score": None,
            "h3k27ac_score": None,
            "p300_score": None,
            "pol2_score": None,
        },
    }


def main(argv):
    """Annotate stored Zscan4 motif instances with their 10kb target genes.

    Steps performed:
      1. validate the command line;
      2. connect to the local MongoDB and publish the
         ``mm9.motif_instance_hughes_test`` collection as the module-level
         global ``mcollection``;
      3. parse every row of the motif/TF information file into a document
         template (the insert itself is currently disabled);
      4. for every stored motif with ``tf_name == "Zscan4"``, compute the
         target genes with ``getTargetGene`` (10000 bp window) and save them
         under ``motif_gene_mapping_info.genelist10kb``.

    Args:
        argv: ``[program, motif_tf_info_file, path-to-fimo-output]``.

    Returns:
        1 after writing a message to stderr when the arguments are missing or
        name a non-existent file/path; otherwise None.
    """
    global mcollection

    if len(argv) < 3:
        sys.stderr.write("Usage: %s motif_tf_info_file path-to-fimo-output\n" % argv[0])
        return 1
    if not os.path.isfile(argv[1]):
        sys.stderr.write('Error: motif_info_file %r was not found!\n' % argv[1])
        return 1
    if not os.path.exists(argv[2]):
        sys.stderr.write('Error: path-to-fimo-output %r was not found!\n' % argv[2])
        return 1

    # A single client is enough; the collection is shared with the update
    # helpers of this module through the ``mcollection`` global.
    client = MongoClient('localhost', 27017)
    db = client["mm9"]
    mcollection = db["motif_instance_hughes_test"]

    # One-time collection setup, currently disabled:
    #   db.drop_collection('motif_instance_hughes_test')
    #   mcollection.ensure_index("motif_id", name="m_id", unique=False, background=True)
    #   mcollection.ensure_index("tf_name", name="tf_name", unique=False, background=True)
    #   mcollection.ensure_index("motif_gene_mapping_info.genelist10kb", name="target_gene", unique=False, background=True)
    #   mcollection.ensure_index("motif_gene_mapping_info.closest_gene", name="closest_gene", unique=False, background=True)
    #   mcollection.ensure_index("motif_score", name="motif_score", unique=False, background=True)
    #   mcollection.ensure_index("motif_tf_info.motif_type", name="motif_type", unique=False, background=True)
    #   mcollection.ensure_index("motif_tf_info.msource_type", name="msource_type", unique=False, background=True)
    #   mcollection.ensure_index("motif_tf_info.tf_status", name="tf_status", unique=False, background=True)
    #   mcollection.ensure_index("motif_tf_info.msource_id", name="project_name", unique=False, background=True)
    #   mcollection.ensure_index("motif_tf_info.family_name", name="family_name", unique=False, background=True)
    #   mcollection.ensure_index([("tf_name", ASCENDING),
    #                             ("motif_gene_mapping_info.genelist10kb", DESCENDING)],
    #                            name="network_edge", unique=False, background=True)
    #   mcollection.ensure_index([("motif_genomic_regions_info.chr", DESCENDING),
    #                             ("motif_genomic_regions_info.start", DESCENDING),
    #                             ("motif_genomic_regions_info.strand", DESCENDING)],
    #                            name="genomic_regions", unique=False, background=True)

    # Read the arguments that were validated above (not sys.argv, which may
    # differ when main() is called with an explicit argument list).
    infile = argv[1]
    path = argv[2]  # only consumed by the disabled update steps below

    with open(infile, 'rt') as ifile:
        for row in csv.reader(ifile, delimiter='\t'):
            motif_instance = _buildMotifInstance(row)
            # Insertion is currently disabled:
            #   try:
            #       mcollection.insert(motif_instance)
            #   except DuplicateKeyError:
            #       pass

    print('before entering genomic region info  %s' % mcollection.count())

    startTime = time.clock()

    # Disabled steps: updateMotifGenomicRegions(fimopath); getCount(wigpath, "Hes5");
    # GFF export via makeGff(cursor, gffWriter, 0).

    # --- update gene features ---
    with open('/home/xc406/data/mm9_refseq_June_2014.txt', 'rt') as refSeqFile:
        refSeqReader = csv.reader(refSeqFile, delimiter='\t')
        tssDict, geneNameDict, geneRangeDict = getRefSeqDict(refSeqReader)
    intervalDict = countBed.sortInterval(geneRangeDict)

    for test in mcollection.find({"tf_name": "Zscan4"}):
        region = test["motif_genomic_regions_info"]
        motifChrom, motifStart, motifEnd = region["chr"], region["start"], region["end"]
        t = getTargetGene(geneRangeDict, intervalDict,
                          motifChrom, motifStart, motifEnd, 10000)
        # Disabled closest-gene / transcript annotation:
        #   closest = closestGene(tssDict, geneNameDict, motifChrom, motifStart, motifEnd)
        #   test["motif_gene_mapping_info"]["closest_gene"] = (closest[0], closest[2])
        #   test["motif_gene_mapping_info"]["dist_tss"] = closest[1]
        #   test["motif_gene_mapping_info"]["transcriptidlist10kb"] = t[1]
        # Store under the schema's "motif_gene_mapping_info" sub-document (the
        # one the template and the indexes use); setdefault keeps this working
        # for documents that were stored without that sub-document.
        test.setdefault("motif_gene_mapping_info", {})["genelist10kb"] = t[0]
        mcollection.save(test)

    # Disabled steps:
    #   updateCons(path, "Hes5")
    #   updateExcludedRegions(path, "Hes5", 0)
    #   updateMap(path, "Hes5", 0)

    print('total time %s' % (time.clock() - startTime))