Exemplo n.º 1
0
    def preDump(self):
        """Reset the cluster-tracking state and register this dumper's data set.

        Clears the current cluster key and member list, creates a
        DataSetDumper bound to this dumper's context, and stores the value
        returned by its dataSet() call for DATASETNAME in self.dsId.

        Always returns True.
        """
        # No cluster has been seen yet.
        self.currentKey, self.currentCluster = None, []
        datasets = DataSetDumper(self.context)
        self.dsd = datasets
        self.dsId = datasets.dataSet(name=self.DATASETNAME)
        return True
Exemplo n.º 2
0
class AnnotationDumper(AbstractItemDumper):
    """Dumps MGI vocabulary annotations, their evidence codes and evidence.

    QTMPLT holds three queries (annotations, evidence-code terms, evidence
    records) and ITMPLT the three matching item templates.  processRecord()
    receives the index of the query a record came from.
    NOTE(review): presumably the base class formats what processRecord()
    returns for query i with ITMPLT[i] -- confirm in AbstractItemDumper.

    postDump() additionally writes stub items for the referenced ontology
    terms and then writes derived allele annotations to
    "DerivedAnnotations.xml".
    """

    QTMPLT = [
        #
        # Get data for each annotation.
        #
        '''
	SELECT 
	    va._annottype_key, 
	    va._annot_key, 
	    vat._vocab_key, 
	    va._term_key, 
	    qt.term AS qualifier,
	    aa.accid AS identifier, 
	    va._object_key
	FROM 
	    VOC_Annot va, 
	    VOC_Term vt, 
	    VOC_Term qt, 
	    VOC_AnnotType vat, 
	    VOC_Vocab vv, 
	    ACC_Accession aa
	WHERE va._annottype_key in (%(ANNOTTYPEKEYS)s)
	AND va._qualifier_key = qt._term_key
	AND va._annottype_key = vat._annottype_key
	AND va._term_key = vt._term_key
	AND vt._term_key = aa._object_key
	AND aa._mgitype_key = %(TERM_TYPEKEY)d
	AND aa.preferred = 1
	AND aa.private = 0
	AND vt._vocab_key = vv._vocab_key
	AND aa._logicaldb_key = vv._logicaldb_key
	%(LIMIT_CLAUSE)s
	''',
        #
        # Evidence code vocabulary terms.
        # WARNING: This query will return dup terms if > 1 annotation type
        # uses the same evidence vocab. (Necessary because we need the annot
        # type key in the results.)
        #
        '''
	SELECT 
	    v._term_key, 
	    v.abbreviation AS code, 
	    v.term, 
	    vat._annottype_key
	FROM 
	    VOC_Term v, 
	    VOC_AnnotType vat
	WHERE vat._evidencevocab_key = v._vocab_key
	AND vat._annottype_key in (%(ANNOTTYPEKEYS)s)
	%(LIMIT_CLAUSE)s
	''',
        #
        # Get annotation evidence info, e.g., evidence code, reference, etc.
        #
        '''
	SELECT 
	    ve._annotevidence_key, 
	    ve._annot_key, 
	    va._annottype_key,
	    ve._evidenceterm_key,
	    ve.inferredfrom, 
	    ve._refs_key
	FROM 
	    VOC_Evidence ve, 
	    VOC_Annot va 
	WHERE ve._annot_key = va._annot_key
	AND va._annottype_key in (%(ANNOTTYPEKEYS)s)
	%(LIMIT_CLAUSE)s
	''']

    ITMPLT = [
        # OntologyAnnotation
        # The actual class name is a parameter (to accommodate GO-annotations,
        # which are their own subclasses in the standard model).
        # Do not explicitly set evidence collection (let reverse refs take care of that)
        # References collection is not yet implemented.
        '''
	<item class="%(class)s" id="%(id)s">
	  <reference name="ontologyTerm" ref_id="%(ontologyterm)s"/>
	  <reference name="subject" ref_id="%(subject)s"/>
	  %(qualifier)s
	  <collection name="dataSets">%(dataSets)s</collection>
	  </item>
	''',
        #
        # OntologyAnnotationEvidenceCode
        #
        '''
	<item class="%(class)s" id="%(id)s">
	  <attribute name="code" value="%(code)s"/>
          <attribute name="name" value="%(name)s"/>
	  </item>
	''',
        #
        # OntologyAnnotationEvidence
        #
        '''
	<item class="%(class)s" id="%(id)s">
	  <reference name="annotation" ref_id="%(annotation)s"/>
	  %(inferredfrom)s
	  <reference name="code" ref_id="%(code)s"/>
	  <collection name="publications">%(publications)s</collection>
	  %(annotationExtension)s
	  <collection name="baseAnnotations">%(baseAnnotations)s</collection>
          <collection name="comments">%(comments)s</collection>
	  </item>
	''',
        ]

    def __init__(self, ctx):
        """Set up the per-annotation-type configuration and query parameters.

        Args:
            ctx: the dumper context, passed through to AbstractItemDumper.
        """
        AbstractItemDumper.__init__(self,ctx)
        self.atk2classes = {
          # FIXME. Hardcoded annotation-type data. This really needs refactoring!
          # annottype_key :
          #    ( data set name,
          #      annotated obj type,
          #      voc term class,
          #      annotation class,
          #      evidence class,
          #      evidence code class,
          #      species,
          #      loadProperties )
          #
          # Mouse marker-GO annotations
          1000 : ('GOTerm to Mouse Feature Annotations from MGI',
                  'Marker', 'GOTerm','GOAnnotation','GOEvidence','GOEvidenceCode','Mouse', False),
          # Mouse genotype-MP annotations
          1002 : ('MPTerm to Mouse Genotype Annotations from MGI',
                  'Genotype', 'MPTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Mouse', True),
          # Mouse genotype-OMIM annotations
          1005 : ('DiseaseTerm to Mouse Genotype Annotations from MGI',
                  'Genotype', 'OMIMTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Mouse', False),
          # Human gene-OMIM annotations
          1006 : ('DiseaseTerm to Human Feature Annotations from MGI',
                  'Marker', 'OMIMTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Human', False),
          # Mouse allele-OMIM annotations
          1012 : ('DiseaseTerm to Mouse Allele Annotations from MGI',
                  'Allele', 'OMIMTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Mouse', False),
          # Mouse marker-derived MP annotation
          1015 : ('MPTerm to Mouse Feature Annotations from MGI',
                  'Marker', 'MPTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Mouse', True),
          # Mouse marker-derived OMIM annotation
          1016 : ('DiseaseTerm to Mouse Feature Annotations from MGI',
                  'Marker', 'OMIMTerm','OntologyAnnotation','OntologyAnnotationEvidence','OntologyAnnotationEvidenceCode','Mouse', True),
        }
        # Materialize as real lists so they can be iterated more than once
        # regardless of whether keys()/filter() return lists or lazy objects.
        self.ANNOTTYPEKEYS = list(self.atk2classes.keys())
        self.ANNOTTYPEKEYS_S = COMMA.join(map(str, self.ANNOTTYPEKEYS))
        self.context.QUERYPARAMS['ANNOTTYPEKEYS'] = self.ANNOTTYPEKEYS_S
        # keys where loadProperties (tuple index 7) is true
        self.ANNOTTYPEKEYS_PROPS = [k for k in self.ANNOTTYPEKEYS if self.atk2classes[k][7]]

    def preDump(self):
        """Initialize per-run state, load evidence properties, write data sets."""
        #
        # Keep track of ontology terms referenced by the annotations.
        # (Since ontologies are loaded separately, here we need
        # to just write out stubs for the OntologyTerm items.)
        #
        self.termsToWrite = set() # the terms we need to create stubs for
        self.assignedkeys = {}  # identifier -> key (eg: 'GO:123456' -> '10013_1001')
        self.loadEvidenceProperties()
        self.writeDataSets()

    def writeDataSets(self):
        """Register one data set per annotation type.

        Fills self.atk2dsid, mapping annotation type key to the value
        returned by DataSetDumper.dataSet() for that type's data set name.
        """
        self.dsd = DataSetDumper(self.context)
        self.atk2dsid = {}
        for atk, atinfo in self.atk2classes.items():
            dsname = atinfo[0]
            self.atk2dsid[atk] = self.dsd.dataSet(name=dsname)

    def loadEvidenceProperties(self):
        """Load evidence property records into self.ek2props.

        Queries the property records attached to evidence of the annotation
        types whose loadProperties flag is set, and indexes them by
        annotation *evidence* key:

          * type 1002 (MP): a property whose value is exactly "M" or "F"
            (case-insensitive) yields the string 'specific_to(male)' or
            'specific_to(female)'.
          * types 1015/1016 (derived): each property whose term is
            "_SourceAnnot_key" appends its value, as an int, to a list of
            source annotation keys.

        GO annotations (key=1000) also have property records in MGI, but the
        translation into GAF annotation extensions is complex (see TR11112);
        they are not loaded here.
        """
        self.ek2props = {}
        q = '''
            select 
	        a._annottype_key, 
		a._annot_key, 
		p._annotevidence_key, 
		p.stanza, 
		p.sequencenum, 
		t.term, 
		p.value
            from 
	        voc_evidence_property p, 
		voc_term t, 
		voc_evidence e, 
		voc_annot a
            where p._propertyterm_key = t._term_key
            and p._annotevidence_key = e._annotevidence_key
            and e._annot_key = a._annot_key
	    and a._annottype_key in (%s)
            order by p._annotevidence_key, p.stanza, p.sequencenum
        ''' % (','.join(map(str, self.ANNOTTYPEKEYS_PROPS)))
        for r in self.context.sql(q):
            atk = r['_annottype_key']
            if atk == 1002:
                raw = r['value']
                v = raw.upper() if raw else ''
                # Exact match only: a substring test against "MF" would also
                # accept the empty string and "MF" and label them female.
                if v in ('M', 'F'):
                    ek = r['_annotevidence_key']
                    self.ek2props[ek] = 'specific_to(%s)' % ('male' if v == 'M' else 'female')
            elif atk == 1015 or atk == 1016:
                if r['term'] == "_SourceAnnot_key":
                    self.ek2props.setdefault(r['_annotevidence_key'],[]).append(int(r['value']))

    def processRecord(self, r, iQuery):
        """Fill in the template fields for one query result record.

        Args:
            r: the result record (a mutable mapping); it is updated in place.
            iQuery: index of the query in QTMPLT that produced the record
                (0 = annotation, 1 = evidence code, anything else = evidence).

        Returns:
            The updated record, or None for an evidence code whose item id
            was already assigned (DuplicateIdError).
        """
        atk = r['_annottype_key']
        # Several fields are unused below; unpacking all of them documents
        # the tuple layout.
        dsname, tname, oclass, aclass, aeclass, aecclass, aspecies, ahasprops = self.atk2classes[atk]
        if iQuery == 0:
            # OntologyAnnotation
            r['id'] = self.context.makeItemId('OntologyAnnotation', r['_annot_key'])
            r['subject'] = self.context.makeItemRef(tname, r['_object_key'])
            r['qualifier'] = ('<attribute name="qualifier" value="%(qualifier)s"/>' % r) if r['qualifier'] else ''
            r['class'] = aclass
            r['dataSets'] = '<reference ref_id="%s"/>'%self.atk2dsid[atk]

            identifier = r['identifier']
            if oclass == 'OMIMTerm':
                r['identifier'] = identifier = "OMIM:"+identifier

            tk = r['_term_key']
            # make the reference without checking (because this dumper will
            # create them later).
            otermkey = self.context.makeGlobalKey('Vocabulary Term', tk)
            r['ontologyterm'] = otermkey
            self.termsToWrite.add( (oclass, otermkey, identifier) )
            return r
        elif iQuery == 1:
            # OntologyEvidenceCode
            try:
                r['id'] = self.context.makeItemId('OntologyAnnotationEvidenceCode', r['_term_key'])
            except DumperContext.DuplicateIdError:
                # The same evidence term is returned once per annotation type
                # that uses its vocabulary; only the first one is kept.
                return
            else:
                r['class'] = aecclass
                r['name'] = r['term']
                return r
        else:
            # OntologyAnnotationEvidence
            ek = r['_annotevidence_key']
            r['id'] = self.context.makeItemId('OntologyAnnotationEvidence', ek)
            r['class'] = aeclass
            r['code'] = self.context.makeItemRef('OntologyAnnotationEvidenceCode', r['_evidenceterm_key'])
            r['annotation'] = self.context.makeItemRef('OntologyAnnotation', r['_annot_key'])
            r['inferredfrom'] = ('<attribute name="withText" value="%(inferredfrom)s"/>'%r) if r['inferredfrom'] else ''
            r['publications'] = '<reference ref_id="%s"/>' % \
                self.context.makeItemRef('Reference', r['_refs_key'])

            r['baseAnnotations'] = ''
            r['annotationExtension'] = ''
            if atk == 1002:
                # MP evidence: optional sex-specificity extension string.
                ext = self.ek2props.get(ek)
                r['annotationExtension'] = ('<attribute name="annotationExtension" value="%s" />'%ext) if ext else ''
            elif atk in [1015,1016]:
                # Derived annotations: link back to the source annotations.
                sourcekeys = self.ek2props.get(ek,[])
                refs = [ self.context.makeItemRef('OntologyAnnotation', k) for k in sourcekeys ]
                r['baseAnnotations'] = ''.join([ '<reference ref_id="%s"/>'%ref for ref in refs ])

            r['comments'] = ''.join(self.context.annotationComments.get(ek,[]))
            return r

    def postDump(self):
        """Write ontology-term stubs, then the derived allele annotations.

        After the stubs, output is switched to "DerivedAnnotations.xml", one
        extra evidence code ("DOA") is written, and an evidence item (plus a
        new annotation item when none exists yet) is written for every
        record yielded by DerivedAnnotationHelper.iterAnnots("Allele").
        """
        # Write out stub items for OntologyTerms.
        tmplt = '''<item class="%(class)s" id="%(id)s"> <attribute name="identifier" value="%(identifier)s" /> </item>\n'''
        for t in self.termsToWrite:
            r = { 'class':t[0], 'id':t[1], 'identifier':t[2] }
            self.writeItem(r, tmplt)

        # switch output files
        self.context.openOutput("DerivedAnnotations.xml")

        # need one more evidence code
        c = {
            'id' : self.context.makeItemId('OntologyAnnotationEvidenceCode'),
            'class' : "OntologyAnnotationEvidenceCode",
            'code' : "DOA",
            'name' : "derived from other annotations",
            }
        self.writeItem(c, self.ITMPLT[1])

        # compute and write out derived annotations
        dsref = self.dsd.dataSet(name="DiseaseTerm to Mouse Allele Annotations from MGI")
        helper = DerivedAnnotationHelper(self.context)

        def writeDerivedAnnot( otype, k, vk, tk, arks ):
            # Args:
            #   otype   "Marker" or "Allele"
            #   k       its MGI database key
            #   vk      vocabulary key
            #   tk      term key
            #   arks    object containing: a set of annotation keys (annots),
            #           a set of reference keys (refs), and (if applicable) an
            #           existing annotation key
            # NOTE: helper data is gotten from the MGI independently. Possible that some of the
            # objects might have been skipped by the dumper code prior to this.
            # Therefore handle dangling reference errors by skipping the record..
            #
            try:
                #
                r = {}
                if arks['existing']:
                    # Attach the evidence to the annotation already dumped.
                    r['id'] = self.context.makeItemRef('OntologyAnnotation', arks['existing'])
                else:
                    r['id'] = self.context.makeItemId('OntologyAnnotation') # start auto-assigning.
                    r['class'] = "OntologyAnnotation"
                    r['subject'] = self.context.makeItemRef(otype, k)
                    r['ontologyterm'] = self.context.makeItemRef('Vocabulary Term', tk)
                    r['qualifier'] = ''
                    r['dataSets'] = '<reference ref_id="%s"/>' % dsref
                #
                s = {}
                s['id'] = self.context.makeItemId('OntologyAnnotationEvidence') # start auto-assigning
                s['class'] = 'OntologyAnnotationEvidence'
                s['annotation'] = r['id']
                s['inferredfrom'] = ''
                s['code'] = c['id']
                #
                # Base annotations / publications that were not dumped are
                # left out individually rather than failing the whole record.
                ars = []
                for ak in arks['annots']:
                    try:
                        ar = self.context.makeItemRef("OntologyAnnotation", ak)
                        ars.append('<reference ref_id="%s" />'%ar)
                    except DumperContext.DanglingReferenceError:
                        pass
                s['baseAnnotations'] = ''.join(ars)
                #
                rrs = []
                for rk in arks['refs']:
                    try:
                        rr = self.context.makeItemRef("Reference", rk)
                        rrs.append('<reference ref_id="%s" />'%rr)
                    except DumperContext.DanglingReferenceError:
                        pass
                s['publications'] = ''.join(rrs)
                s['annotationExtension'] = ''

                s['comments'] = ''

                #
            except DumperContext.DanglingReferenceError:
                # Subject, term or existing annotation was not dumped: skip.
                pass
            else:
                if not arks['existing']:
                    self.writeItem(r, self.ITMPLT[0])
                self.writeItem(s, self.ITMPLT[2])

        #for (mk, vk, tk, arks) in helper.iterAnnots("Marker"):
        #    writeDerivedAnnot("Marker", mk, vk, tk, arks)
        for (ak, vk, tk, arks) in helper.iterAnnots("Allele"):
            writeDerivedAnnot("Allele", ak, vk, tk, arks)
Exemplo n.º 3
0
class HomologyDumper(AbstractItemDumper):
    """Dumps Homologue items for markers that share a homology cluster.

    The query returns one row per (cluster, marker), ordered by cluster key.
    Rows are accumulated per cluster; when the cluster key changes (and once
    more in postDump) a Homologue item is written for every ordered pair of
    distinct members of the finished cluster.
    """

    DATASETNAME = "Mouse/Human Orthologies from MGI"

    QTMPLT = '''
	SELECT mc._cluster_key, mm._marker_key, mm.symbol, mm._organism_key
	FROM mrk_cluster mc, mrk_clustermember mcm, mrk_marker mm
	WHERE mc._cluster_key = mcm._cluster_key
	AND mcm._marker_key = mm._marker_key
	AND mc._clustersource_key = %(MGI_HYBRID_HOMOLOGY_KEY)d
	ORDER BY mc._cluster_key
	'''

    ITMPLT = '''
	<item class="Homologue" id="%(id)s">
	<attribute name="type" value="%(type)s" />
	<reference name="gene" ref_id="%(gene)s" />
	<reference name="homologue" ref_id="%(homologue)s" />
	<collection name="dataSets"><reference ref_id="%(dataSet)s"/></collection>
	</item>
	'''

    def preDump(self):
        """Reset cluster state and register the data set. Returns True."""
        self.currentKey = None      # key of the cluster being accumulated
        self.currentCluster = []    # rows of that cluster seen so far
        self.dsd = DataSetDumper(self.context)
        self.dsId = self.dsd.dataSet(name=self.DATASETNAME)
        return True

    def flushOnePair(self, a, b):
        """Write one Homologue item with a as the gene and b as its homologue.

        The type is "paralogue" when both rows have the same organism key,
        otherwise "orthologue".
        """
        htype = "paralogue" if a['_organism_key'] == b['_organism_key']  else "orthologue"
        r = {
            "id" : self.context.makeItemId("Homologue"),
            "type" : htype,
            "gene" : self.context.makeItemRef("Marker", a['_marker_key']),
            "homologue" : self.context.makeItemRef("Marker", b['_marker_key']),
            "dataSet" : self.dsId
            }
        self.writeItem(r, self.ITMPLT)

    def flush(self):
        """Write Homologue items, in both directions, for the current cluster."""
        cluster = self.currentCluster
        for i, first in enumerate(cluster):
            for second in cluster[i + 1:]:
                self.flushOnePair(first, second)
                self.flushOnePair(second, first)

    def processRecord(self, r):
        """Accumulate one query row; flush the previous cluster on key change.

        Always returns None: items are written by flush(), not from the
        value returned here.
        """
        cck = r['_cluster_key']
        if cck == self.currentKey:
            self.currentCluster.append(r)
        else:
            # Compare against None rather than relying on truthiness, so a
            # cluster whose key is 0 is not silently dropped.
            if self.currentKey is not None:
                self.flush()
            self.currentKey = cck
            self.currentCluster = [ r ]
        return None

    def postDump(self):
        """Log and flush the last accumulated cluster, if there is one."""
        self.context.log("PostDump:"+str(self.currentCluster))
        if self.currentKey is not None:
            self.flush()
Exemplo n.º 4
0
    def writeDataSets(self):
        """Register one data set per configured annotation type.

        Creates a DataSetDumper bound to this dumper's context and fills
        self.atk2dsid, mapping each key of self.atk2classes to the value
        returned by dataSet() for the name stored in the first element of
        that key's entry.
        """
        registrar = DataSetDumper(self.context)
        self.dsd = registrar
        ids = self.atk2dsid = {}
        for typekey in self.atk2classes:
            ids[typekey] = registrar.dataSet(name=self.atk2classes[typekey][0])