def qtl2():
    """Write the "mapping = yes / alleles = no" QTL report to fp2.

    Two sections are written, each followed by a "(N rows affected)" line:
    the rows selected by query1 (one tab-delimited line per marker) and the
    rows selected by query2 (one reference ID per line).  Both queries are
    narrowed to markers that have no MRK_Notes row, have an MLD_Expt_Marker
    row, and have no ALL_Allele row with isWildType = 0.

    Uses names defined outside this function: db, fp2, query1, query2,
    mgi_utils, TAB and CRT.
    """

    # The restriction is identical for both queries; only the select prefix
    # (first %s) and the sort column (second %s) differ.
    restriction = (
        '%s '
        'and not exists (select 1 from MRK_Notes n where m._Marker_key = n._Marker_key) '
        'and exists (select 1 from MLD_Expt_Marker mld where m._Marker_key = mld._Marker_key) '
        'and not exists (select 1 from ALL_Allele al where m._Marker_key = al._Marker_key '
        'and al.isWildType = 0) '
        'order by %s '
    )
    rowCount = '%s(%d rows affected)%s'

    # section 1: markers
    fp2.write('QTL markers that have mapping but no allele associated (yes/no)\n')
    fp2.write('mapping = yes/alleles = no\n')

    markerRows = db.sql(restriction % (query1, 'symbol'), 'auto')
    for row in markerRows:
        cells = (row['mgiID'], mgi_utils.prvalue(row['refID']), row['symbol'], row['name'])
        for cell in cells:
            fp2.write(cell + TAB)
        fp2.write(CRT)
    fp2.write(rowCount % (CRT, len(markerRows), CRT))

    # section 2: references
    fp2.write('\n\nQTL References with map records that have QTL associated w/o Alleles:\n\n')

    referenceRows = db.sql(restriction % (query2, 'numericPart'), 'auto')
    for row in referenceRows:
        fp2.write(mgi_utils.prvalue(row['refID']) + CRT)
    fp2.write(rowCount % (CRT, len(referenceRows), CRT))
def writeAccBCP():
    '''
    # requires:
    #   module-level db, taxId, geneIDtoMarkerKey, accessionlib, mgi_utils,
    #   accFile, accrefFile, mgiTypeKey, referenceKey, userKey and loaddate
    #   (all defined outside this function)
    #
    # effects:
    #   Writes one accFile record per WRK_EntrezGene_Bucket0 row for taxId.
    #   Rows with refRequired = 1 are handled first and also get an
    #   accrefFile record; rows with refRequired = 0 follow.
    #   Increments the global accKey once per row written.
    #
    # returns:
    #   nothing
    '''

    global accKey, userKey

    selectFrom = 'select _Object_key, _LogicalDB_key, accID, private, geneID ' + \
        'from WRK_EntrezGene_Bucket0 '
    accFormat = '%d|%s|%s|%s|%d|%d|%s|%d|1|%s|%s|%s|%s\n'
    accRefFormat = '%d|%s|%s|%s|%s|%s\n'

    # (query text, whether an accession reference record is written too)
    passes = (
        (selectFrom + 'where taxID = %s and refRequired = 1 ' % (taxId), 1),
        (selectFrom + 'where taxID = %s and refRequired = 0' % (taxId), 0),
    )

    for query, needsReference in passes:
        for row in db.sql(query, 'auto'):

            # an _Object_key of -1 means the marker key is looked up by gene ID
            objectKey = row['_Object_key']
            if objectKey == -1:
                objectKey = geneIDtoMarkerKey[row['geneID']]

            prefixPart, numericPart = accessionlib.split_accnum(row['accID'])

            accFile.write(accFormat % (
                accKey, row['accID'],
                mgi_utils.prvalue(prefixPart), mgi_utils.prvalue(numericPart),
                row['_LogicalDB_key'], objectKey, mgiTypeKey, row['private'],
                userKey, userKey, loaddate, loaddate))

            if needsReference:
                accrefFile.write(accRefFormat % (
                    accKey, referenceKey, userKey, userKey, loaddate, loaddate))

            accKey += 1
def writeRecord(i, r, e):
    """Write one 15-column, tab-delimited annotation line to fp.

    i -- text written in column 2
    r -- row providing '_Term_key', 'qualifier', 'termID' and 'mDate'
    e -- sequence whose first three items fill columns 6, 7 and 8

    Nothing is written when the row's term key is not present in dag.
    Uses names defined outside this function: dag, fp, mgi_utils, FIELD1,
    FIELD12, FIELD15, TAB and CRT.
    """

    termKey = r['_Term_key']

    # if we can't find the DAG for the Term, skip it
    if termKey not in dag:
        return

    rawQualifier = r['qualifier']
    qualifier = '' if rawQualifier is None else rawQualifier.strip()

    columns = [
        FIELD1,                         # field 1
        i,                              # field 2
        '',                             # field 3 (always empty)
        qualifier,                      # field 4
        r['termID'],                    # field 5
        mgi_utils.prvalue(e[0]),        # field 6
        mgi_utils.prvalue(e[1]),        # field 7
        mgi_utils.prvalue(e[2]),        # field 8
        dag[termKey],                   # field 9
        '',                             # field 10 (always empty)
        '',                             # field 11 (always empty)
        FIELD12,                        # field 12
        '',                             # field 13 (always empty)
        str(r['mDate']),                # field 14
        FIELD15,                        # field 15
    ]
    fp.write(TAB.join(columns) + CRT)
def createBCP():
    """Build the sequence/probe/reference bcp file '<datadir>/<table>.bcp'.

    Steps, each bracketed by timestamped progress messages on stdout:
      1. sequences1: pairs of ACC_Accession rows (MGI type 19 and MGI type 3)
         that share a logical DB and a case-insensitively equal accID.
      2. rows of sequences1 whose probeKey appears in the 'excluded' table
         are deleted, then the transaction is committed.
      3. sequences2: sequences1 joined to ACC_AccessionReference.
      4. one output line per (sequenceKey, probeKey, refsKey) carrying the
         max userKey and max mdate of the group.

    The 'excluded' table is not created here, so it must already exist.
    Uses names defined outside this function: db, mgi_utils, datadir, table,
    loaddate, DL and NL.
    """

    # 'with' guarantees the bcp file is flushed and closed even when one of
    # the statements below raises; previously the handle was leaked.
    with open('%s/%s.bcp' % (datadir, table), 'w') as outBCP:

        print('sequences1 begin...%s' % (mgi_utils.date()))
        db.sql('''select s._Object_key as sequenceKey, p._Object_key as probeKey, p._Accession_key
            INTO TEMPORARY TABLE sequences1
            from ACC_Accession s, ACC_Accession p
            where s._MGIType_key = 19
            and lower(s.accID) = lower(p.accID)
            and p._MGIType_key = 3
            and s._LogicalDB_key = p._LogicalDB_key
            ''', None)
        db.sql('create index idx2 on sequences1 (sequenceKey)', None)
        db.sql('create index idx3 on sequences1 (probeKey)', None)
        db.sql('create index idx4 on sequences1 (_Accession_key)', None)
        print('sequences1 end...%s' % (mgi_utils.date()))

        print('deletion begin...%s' % (mgi_utils.date()))
        db.sql('delete from sequences1 using excluded e where sequences1.probeKey = e._Probe_key', None)
        print('deletion end...%s' % (mgi_utils.date()))
        db.commit()

        print('sequences2 begin...%s' % (mgi_utils.date()))
        db.sql('''select s.sequenceKey, s.probeKey, ar._Refs_key as refskey,
            ar._ModifiedBy_key as userKey, ar.modification_date as mdate
            INTO TEMPORARY TABLE sequences2
            from sequences1 s, ACC_AccessionReference ar
            where s._Accession_key = ar._Accession_key
            ''', None)
        db.sql('create index idx5 on sequences2 (sequenceKey, probeKey, refsKey, userKey, mdate)', None)
        db.sql('create index idx6 on sequences2 (userKey)', None)
        db.sql('create index idx7 on sequences2 (mdate)', None)
        print('sequences2 end...%s' % (mgi_utils.date()))

        print('final begin...%s' % (mgi_utils.date()))
        results = db.sql('''select distinct sequenceKey, probeKey, refsKey,
            max(userKey) as userKey, max(mdate) as mdate
            from sequences2
            group by sequenceKey, probeKey, refsKey
            ''', 'auto')
        print('final end...%s' % (mgi_utils.date()))

        for r in results:
            # userKey is written twice and loaddate twice: the line carries
            # two user columns and two date columns after mdate.
            # NOTE(review): r['mdate'] is concatenated without prvalue/str,
            # so it is assumed to come back as a string - confirm.
            outBCP.write(mgi_utils.prvalue(r['sequenceKey']) + DL + \
                mgi_utils.prvalue(r['probeKey']) + DL + \
                mgi_utils.prvalue(r['refsKey']) + DL + \
                r['mdate'] + DL + \
                mgi_utils.prvalue(r['userKey']) + DL + mgi_utils.prvalue(r['userKey']) + DL + \
                loaddate + DL + loaddate + NL)
def writeAccBCP():
    '''
    # requires:
    #   module-level db, taxId, geneIDtoMarkerKey, accessionlib, mgi_utils,
    #   accFile, accrefFile, mgiTypeKey, referenceKey, userKey and loaddate
    #   (all defined outside this function)
    #
    # effects:
    #   Creates the accession BCP records for the WRK_EntrezGene_Bucket0
    #   rows of taxId: first the rows that require a reference (these also
    #   get an accrefFile record), then the rows that do not.
    #   The global accKey advances by one for every row written.
    #
    # returns:
    #   nothing
    '''

    global accKey, userKey

    columnsAndTable = 'select _Object_key, _LogicalDB_key, accID, private, geneID ' + \
        'from WRK_EntrezGene_Bucket0 '

    queries = [
        # records that require a reference
        (1, columnsAndTable + 'where taxID = %s and refRequired = 1 ' % (taxId)),
        # records that don't require a reference
        (0, columnsAndTable + 'where taxID = %s and refRequired = 0' % (taxId)),
    ]

    for writeReference, sqlText in queries:
        for rec in db.sql(sqlText, 'auto'):

            # -1 is the placeholder for "resolve the marker from the gene ID"
            if rec['_Object_key'] != -1:
                objectKey = rec['_Object_key']
            else:
                objectKey = geneIDtoMarkerKey[rec['geneID']]

            prefixPart, numericPart = accessionlib.split_accnum(rec['accID'])

            accValues = (accKey, rec['accID'],
                mgi_utils.prvalue(prefixPart), mgi_utils.prvalue(numericPart),
                rec['_LogicalDB_key'], objectKey, mgiTypeKey, rec['private'],
                userKey, userKey, loaddate, loaddate)
            accFile.write('%d|%s|%s|%s|%d|%d|%s|%d|1|%s|%s|%s|%s\n' % accValues)

            if writeReference:
                refValues = (accKey, referenceKey, userKey, userKey, loaddate, loaddate)
                accrefFile.write('%d|%s|%s|%s|%s|%s\n' % refValues)

            accKey += 1
def qtl4():
    """Write the J:23000 / J:85000 QTL report to fp4.

    Selects markers with _Marker_Type_key = 6, _Marker_Status_key = 1 and
    _Organism_key = 1 that have no MRK_Notes row and that are joined through
    the 'refs' table to a reference whose _Refs_key is 22864 or 85477.
    One tab-delimited line (MGI ID, reference ID, symbol, name) is written
    per row, followed by a "(N rows affected)" line.

    Uses names defined outside this function: db, fp4, mgi_utils, TAB and
    CRT; the 'refs' table is not created here.
    """

    sql = (
        " select a1.accID as mgiID, a2.accID as refID, m.symbol, m.name "
        "from MRK_Marker m, ACC_Accession a1, ACC_Accession a2, refs r "
        "where m._Marker_Type_key = 6 "
        "and m._Marker_Status_key = 1 "
        "and m._Organism_key = 1 "
        "and m._Marker_key = a1._Object_key "
        "and a1._MGIType_key = 2 "
        "and a1._Logicaldb_key = 1 "
        "and a1.prefixpart = 'MGI:' "
        "and a1.preferred = 1 "
        "and m._Marker_key = r._Marker_key "
        "and r._Refs_key in (22864, 85477) "
        "and r._Refs_key = a2._Object_key "
        "and a2._MGIType_key = 1 "
        "and a2._Logicaldb_key = 1 "
        "and a2.prefixpart = 'J:' "
        "and a2.preferred = 1 "
        "and not exists (select 1 from MRK_Notes n where m._Marker_key = n._Marker_key) "
        "order by symbol "
    )

    fp4.write('QTL markers that have assigned reference of J:23000 or J:85000\n')

    rows = db.sql(sql, 'auto')
    for row in rows:
        cells = (row['mgiID'], mgi_utils.prvalue(row['refID']), row['symbol'], row['name'])
        for cell in cells:
            fp4.write(cell + TAB)
        fp4.write(CRT)

    fp4.write('%s(%d rows affected)%s' % (CRT, len(rows), CRT))
def history():
    """Write the "History" section for markerKey to fp.

    Emits a three-line '#' banner, then one line per MRK_History_View row of
    the marker (ordered by sequenceNum).  Every column is left-justified to a
    fixed width and tab-separated; only edate is passed through
    mgi_utils.prvalue before padding.

    Uses names defined outside this function: db, fp, markerKey, mgi_utils,
    TAB and CRT.
    """

    sql = (
        ' select name, history, historyName, event, eventReason, symbol, markerName, '
        'edate = convert(char(10), event_date, 101), '
        'cdate = convert(char(10), creation_date, 101) '
        'from MRK_History_View '
        'where _Marker_key = %s '
        'order by sequenceNum '
    )

    # (column, padded width) for the leading columns, in output order
    leading = (
        ('history', 15),
        ('name', 35),
        ('historyName', 35),
        ('symbol', 15),
        ('markerName', 35),
        ('event', 15),
        ('eventReason', 15),
    )

    #
    # history
    #
    fp.write('#\n# History\n#\n')

    for row in db.sql(sql % (markerKey), 'auto'):
        for column, width in leading:
            fp.write(row[column].ljust(width) + TAB)
        fp.write(mgi_utils.prvalue(row['edate']).ljust(15) + TAB)
        fp.write(row['cdate'].ljust(15) + CRT)
def goAnnotations():
    """Write the "GO annotations" section for markerKey to fp.

    Emits a three-line '#' banner, then one line per annotation/evidence pair
    (VOC_Annot_View joined to VOC_Evidence_View, _AnnotType_key = 1000,
    ordered by accID then jnumID).  Columns are left-justified to fixed
    widths and tab-separated; only the qualifier goes through
    mgi_utils.prvalue before padding.

    Uses names defined outside this function: db, fp, markerKey, mgi_utils,
    TAB and CRT.
    """

    sql = (
        ' select a.accID, a.term, a.qualifier, e.evidenceCode, e.jnumID, e.createdBy, '
        'cdate = convert(char(10), a.creation_date, 101) '
        'from VOC_Annot_View a, VOC_Evidence_View e '
        'where a._AnnotType_key = 1000 '
        'and _Object_key = %s '
        'and a._Annot_key = e._Annot_key '
        'order by a.accID, e.jnumID '
    )

    # (column, padded width) for the columns between qualifier and cdate
    middle = (
        ('jnumID', 10),
        ('evidenceCode', 10),
        ('createdBy', 30),
        ('term', 100),
    )

    #
    # GO annotations
    #
    fp.write('#\n# GO annotations\n#\n')

    for row in db.sql(sql % (markerKey), 'auto'):
        fp.write(row['accID'].ljust(15) + TAB)
        fp.write(mgi_utils.prvalue(row['qualifier']).ljust(5) + TAB)
        for column, width in middle:
            fp.write(row[column].ljust(width) + TAB)
        fp.write(row['cdate'].ljust(15) + CRT)
def processFile():
    '''
    # requires:
    #   module-level inputFile, outputFile, errorFile, db, mgi_utils,
    #   parseType, delim and createdBy (all defined outside this function)
    #
    # effects:
    #   Reads input file
    #   Writes output file: one line per input line whose "good" name exists
    #   in the database and differs from the "bad" name
    #   Writes to the error file: lines with too few columns and "good"
    #   names that are not found
    #
    # returns:
    #   nothing
    '''

    # For each line in the input file
    for line in inputFile.readlines():

        # lines flagged with a leading '!' are skipped
        if line[0] == '!':
            continue

        # strip only the line terminator, so an unterminated last line does
        # not lose its final character
        tokens = line.rstrip('\n').split(delim)

        # the bad name is in column 1 for 'Library', column 2 otherwise;
        # the good name is always in column 3
        try:
            if parseType == 'Library':
                badName = tokens[0]
            else:
                badName = tokens[1]
            goodName = tokens[2]
        except IndexError:
            errorFile.write('Invalid line: %s\n' % (line))
            continue

        # NOTE(review): goodName is interpolated into the SQL text as-is; a
        # name containing a double quote would break the statement.
        if parseType == 'Tissues':
            results = db.sql('select _Tissue_key from PRB_Tissue where tissue = "%s"' % (goodName), 'auto')

        elif parseType == 'Cell':
            results = db.sql('select term from VOC_Term where term = "%s"' % (goodName), 'auto')

        elif parseType == 'Library':
            results = db.sql('select _Source_key from PRB_Source where name = "%s"' % (goodName), 'auto')

        elif parseType == 'Strains':
            results = db.sql('select a.accID from PRB_Strain_Acc_View a, PRB_Strain s ' + \
                'where s.strain = "%s" ' % (goodName) + \
                'and s._Strain_key *= a._Object_key ' + \
                'and a._LogicalDB_key = 1 ' + \
                'and a.prefixPart = "MGI:" ' + \
                'and a.preferred = 1', 'auto')

        if len(results) > 0 and badName != goodName:
            # Only the 'Strains' query selects accID.  This test used to
            # compare against 'strain', which no query branch uses, so the
            # strain's MGI ID was never written.
            if parseType == 'Strains':
                outputFile.write(mgi_utils.prvalue(results[0]['accID']) + delim + \
                    goodName + delim + badName + delim + createdBy + '\n')
            else:
                outputFile.write(delim + goodName + delim + badName + delim + createdBy + '\n')

        elif len(results) == 0:
            errorFile.write('Invalid good name: %s\n' % (goodName))
def createBCPfile():
    """Write '<outDir>/<table>.bcp' with one line per GO term / DAG pairing.

    Selects the VOC_Term rows of _Vocab_key 4 together with their preferred
    accession ID in the logical DB named 'GO' and the DAG each term belongs
    to.  Each output line holds: running key (starting at 1), term key, DAG
    key, DAG abbreviation, accession ID and term.

    Uses names defined outside this function: db, mgi_utils, outDir, table,
    COLDL and LINEDL.
    """

    print('Creating %s.bcp...' % (table))

    # 'with' closes the file even if the query or a write raises; previously
    # the handle was leaked in that case.
    with open(outDir + '/%s.bcp' % (table), 'w') as cacheBCP:

        results = db.sql('''select t._Term_key, n._DAG_key, t.term, a.accID, d.abbreviation
            from VOC_Term t, ACC_Accession a, ACC_LogicalDB ldb, VOC_VocabDAG vd, DAG_Node n, DAG_DAG d
            where t._Vocab_key = 4
            and t._Term_key = a._Object_key
            and a._MGIType_key = 13
            and a.preferred = 1
            and ldb._logicaldb_key = a._logicaldb_key
            and ldb.name = 'GO'
            and t._Vocab_key = vd._Vocab_key
            and t._Term_key = n._Object_key
            and vd._DAG_key = n._DAG_key
            and n._DAG_key = d._DAG_key
            ''', 'auto')

        # cacheKey is the 1-based position of the row in the result set
        for cacheKey, r in enumerate(results, 1):
            cacheBCP.write(mgi_utils.prvalue(cacheKey) + COLDL + \
                mgi_utils.prvalue(r['_Term_key']) + COLDL + \
                mgi_utils.prvalue(r['_DAG_key']) + COLDL + \
                mgi_utils.prvalue(r['abbreviation']) + COLDL + \
                mgi_utils.prvalue(r['accID']) + COLDL + \
                mgi_utils.prvalue(r['term']) + LINEDL)
def qtl3():
    """Write the "mapping = no / alleles = yes" QTL report to fp3.

    Runs query1 narrowed to markers that have no MRK_Notes row, have no
    MLD_Expt_Marker row, and have an ALL_Allele row with isWildType = 0,
    ordered by symbol.  One tab-delimited line (MGI ID, reference ID, symbol,
    name) is written per row, followed by a "(N rows affected)" line.

    Uses names defined outside this function: db, fp3, query1, mgi_utils,
    TAB and CRT.
    """

    sql = (
        '%s '
        'and not exists (select 1 from MRK_Notes n where m._Marker_key = n._Marker_key) '
        'and not exists (select 1 from MLD_Expt_Marker mld where m._Marker_key = mld._Marker_key) '
        'and exists (select 1 from ALL_Allele al where m._Marker_key = al._Marker_key '
        'and al.isWildType = 0) '
        'order by symbol '
    )

    fp3.write('QTL markers that have no mapping but have allele associated (no/yes)\n')
    fp3.write('mapping = no/alleles = yes\n')

    rows = db.sql(sql % (query1), 'auto')
    for row in rows:
        cells = (row['mgiID'], mgi_utils.prvalue(row['refID']), row['symbol'], row['name'])
        for cell in cells:
            fp3.write(cell + TAB)
        fp3.write(CRT)

    fp3.write('%s(%d rows affected)%s' % (CRT, len(rows), CRT))
def qtl5():
    """Write the "QTL markers that are Reserved" report to fp5.

    Selects markers with _Marker_Type_key = 6 and _Marker_Status_key = 3,
    joined to their preferred 'MGI:' accession ID, their status row and the
    reference of their first history row (sequenceNum = 1), ordered by
    creation date.  One tab-delimited line (creation date, status, MGI ID,
    reference ID, symbol, name) is written per row, followed by a
    "(N rows affected)" line.

    Uses names defined outside this function: db, fp5, mgi_utils, TAB and
    CRT.
    """

    sql = (
        " select m._Marker_key, to_char(m.creation_date, 'MM/dd/yyyy') as creation_date, "
        "t.status, a1.accID as mgiID, r.jnumID as refID, m.symbol, m.name "
        "from ACC_Accession a1, MRK_Status t, MRK_Marker m, MRK_History h, BIB_Citation_Cache r "
        "where m._Marker_Type_key = 6 "
        "and m._Marker_Status_key = 3 "
        "and m._Marker_key = a1._Object_key "
        "and a1._MGIType_key = 2 "
        "and a1._Logicaldb_key = 1 "
        "and a1.prefixpart = 'MGI:' "
        "and a1.preferred = 1 "
        "and m._Marker_Status_key = t._Marker_Status_key "
        "and m._Marker_key = h._Marker_key "
        "and h.sequenceNum = 1 "
        "and h._Refs_key = r._Refs_key "
        "order by m.creation_date "
    )

    fp5.write('QTL markers that are Reserved\n')

    rows = db.sql(sql, 'auto')
    for row in rows:
        cells = (
            row['creation_date'],
            row['status'],
            row['mgiID'],
            mgi_utils.prvalue(row['refID']),
            row['symbol'],
            row['name'],
        )
        for cell in cells:
            fp5.write(cell + TAB)
        fp5.write(CRT)

    fp5.write('%s(%d rows affected)%s' % (CRT, len(rows), CRT))
def marker():
    """Write the "Marker" section for markerKey to fp.

    Emits a three-line '#' banner, then one line per MRK_Marker row matching
    markerKey: symbol, name, chromosome, cytogenetic offset and creation
    date, each left-justified to a fixed width and tab-separated.  Only the
    cytogenetic offset goes through mgi_utils.prvalue before padding.

    Uses names defined outside this function: db, fp, markerKey, mgi_utils,
    TAB and CRT.
    """

    sql = (
        ' select symbol, name, chromosome, cytogeneticOffset, '
        'cdate = convert(char(10), creation_date, 101) '
        'from MRK_Marker '
        'where _Marker_key = %s '
    )

    #
    # marker
    #
    fp.write('#\n# Marker\n#\n')

    for row in db.sql(sql % (markerKey), 'auto'):
        for column, width in (('symbol', 30), ('name', 30), ('chromosome', 5)):
            fp.write(row[column].ljust(width) + TAB)
        fp.write(mgi_utils.prvalue(row['cytogeneticOffset']).ljust(5) + TAB)
        fp.write(row['cdate'].ljust(15) + CRT)
def processImagePaneFile():
    """Convert the image pane input file into image pane output records.

    Each tab-delimited input line supplies: pix ID, pane label, pane width
    and pane height.  For every line one record is written to outPaneFile:
    pane key, the imagePix entry for the pix ID, label, x (0), y (0), width,
    height and loaddate twice.  The global paneKey is incremented per record.

    A line with fewer than four columns is reported through exit(1, ...).
    NOTE(review): exit is called with (status, message), so it is presumably
    the file's own helper rather than the builtin - confirm.
    A pix ID missing from imagePix raises KeyError (not handled here).

    Returns the number of input lines read.

    Uses names defined outside this function: inPaneFile, outPaneFile,
    imagePix, paneKey, mgi_utils, loaddate, TAB and CRT.
    """

    global imagePix, paneKey

    lineNum = 0

    # For each line in the input file
    for line in inPaneFile.readlines():

        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        # only a short line can fail here, so catch exactly that
        try:
            pixID = tokens[0]
            paneLabel = tokens[1]
            paneWidth = tokens[2]
            paneHeight = tokens[3]
        except IndexError:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # panes are always placed at the origin
        paneX = 0
        paneY = 0

        outPaneFile.write(str(paneKey) + TAB + \
            str(imagePix[pixID]) + TAB + \
            mgi_utils.prvalue(paneLabel) + TAB + \
            str(paneX) + TAB + \
            str(paneY) + TAB + \
            str(paneWidth) + TAB + \
            str(paneHeight) + TAB + \
            loaddate + TAB + loaddate + CRT)

        paneKey = paneKey + 1

    # end of "for line in inPaneFile.readlines():"

    return lineNum
def writeRecord(results, labelStatusKey, priority, labelType, labelTypeName):
    """Write one outBCP line per row of results, then print a progress line.

    results        -- rows providing '_Allele_key' and 'label' (and
                      'labelTypeName' when labelTypeName is None)
    labelStatusKey -- written as-is on every line
    priority       -- written as-is on every line
    labelType      -- written as-is on every line
    labelTypeName  -- type name for every line, or None to take each row's
                      own 'labelTypeName' value

    Uses names defined outside this function: outBCP, mgi_utils, loaddate,
    COLDL and LINEDL.
    """

    for r in results:

        # Resolve the type name per row.  The parameter itself must not be
        # overwritten: doing so made every row after the first reuse the
        # first row's type name when None was passed in.
        if labelTypeName is None:
            rowTypeName = r['labelTypeName']
        else:
            rowTypeName = labelTypeName

        outBCP.write(mgi_utils.prvalue(r['_Allele_key']) + COLDL + \
            mgi_utils.prvalue(labelStatusKey) + COLDL + \
            mgi_utils.prvalue(priority) + COLDL + \
            mgi_utils.prvalue(r['label']) + COLDL + \
            mgi_utils.prvalue(labelType) + COLDL + \
            mgi_utils.prvalue(rowTypeName) + COLDL + \
            loaddate + COLDL + \
            loaddate + LINEDL)

    print('processed (%d) records...%s' % (len(results), mgi_utils.date()))
if providers.has_key(key): provider = providers[key] else: provider = noneDisplay fp.write(r['accID'] + TAB) fp.write(r['markerType'] + TAB) fp.write(featureType + TAB) fp.write(r['symbol'] + TAB) fp.write(r['name'] + TAB) # prefer to display genomic chromosome (associated with coordinates) # rather than genetic chromosome (associated with cM / cytoband) if r['genomicChromosome']: fp.write(r['genomicChromosome'] + TAB) else: fp.write(r['chromosome'] + TAB) fp.write(str(r['startCoordinate']) + TAB) fp.write(str(r['endCoordinate']) + TAB) fp.write(mgi_utils.prvalue(r['strand']) + TAB) fp.write(genomeBuild + TAB) fp.write(provider + TAB) fp.write(r['displayName'] + TAB) fp.write(CRT) reportlib.finish_nonps(fp) db.useOneConnection(0)
for r in results: key = r['_marker_key'] # if the marker's feature type is not # 'mutation defined region', key=11928467 write out to the report # default feature type fTypes = '' if featureTypes.has_key(key): mcvKeyList = featureTypeByKey[key] if 11928467 in mcvKeyList: continue else: fTypes = (string.join(featureTypes[key],'|')) fp1.write(mgi_utils.prvalue(r['accid']) + TAB) if r['genomicChromosome']: fp1.write(r['genomicChromosome'] + TAB) else: fp1.write(r['chromosome'] + TAB) fp1.write(r['cmposition'] + TAB) if coords.has_key(key): fp1.write(mgi_utils.prvalue(coords[key][0]['startC']) + TAB) fp1.write(mgi_utils.prvalue(coords[key][0]['endC']) + TAB) fp1.write(mgi_utils.prvalue(coords[key][0]['strand']) + TAB) else: fp1.write(TAB + TAB + TAB)
# Collect, per marker in the 'markers' table, its accession IDs of
# MGI type 2 in logical DB 9 (already sorted by the query).
results = db.sql(
    ' select m._Marker_key, a.accID '
    'from markers m, ACC_Accession a '
    'where m._Marker_key = a._Object_key '
    'and a._MGIType_key = 2 '
    'and a._LogicalDB_key = 9 '
    'order by m._Marker_key, a.accID ',
    'auto')

seqIDs = {}
for r in results:
    seqIDs.setdefault(r['_Marker_key'], []).append(r['accID'])

# One report line per marker: symbol, name, MGI ID, chromosome and the
# space-separated accession IDs (an empty last column when there are none).
results = db.sql('select * from markers order by symbol, mgiID', 'auto')

for r in results:
    fp.write(reportlib.TAB.join((
        mgi_utils.prvalue(r['symbol']),
        mgi_utils.prvalue(r['name']),
        mgi_utils.prvalue(r['mgiID']),
        mgi_utils.prvalue(r['chromosome']),
    )) + reportlib.TAB)

    if r['_Marker_key'] in seqIDs:
        fp.write(' '.join(seqIDs[r['_Marker_key']]))

    fp.write(reportlib.CRT)

reportlib.finish_nonps(fp)
def processFile():
    # Purpose: Read the input file, resolve values to keys. Create bcp files
    # Returns: 0 (an invalid line ends the run through exit())
    # Assumes: file descriptors have been initialized
    # Effects: exits if the line has fewer than 16 columns;
    #          lines whose values cannot be resolved are skipped (the verify
    #          functions are handed fpErrorFile to report to);
    #          increments the global key counters for every allele written
    # Throws: Nothing
    #
    # isExtinct, isMixed, loaddate, the *TypeKey values, mgiPrefix, DEBUG and
    # the file descriptors are not set here; they are read from module scope.

    global alleleKey, refAssocKey, accKey, noteKey, mgiKey, annotKey
    global alleleLookup, alleleMutationKey

    lineNum = 0

    # For each line in the input file
    for line in fpInputFile.readlines():

        lineNum = lineNum + 1
        print('%s: %s' % (lineNum, line))

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        # only a short line can fail here, so catch exactly that
        try:
            markerID = tokens[0]
            markerSymbol = tokens[1]
            mutationType = tokens[2]        # IMPC allele type
            description = tokens[3]
            colonyID = tokens[4]
            strainOfOrigin = tokens[5]
            alleleSymbol = tokens[6]
            alleleName = tokens[7]
            inheritanceMode = tokens[8]
            alleleType = tokens[9]          # IMPC allele class
            alleleSubType = tokens[10]
            alleleStatus = tokens[11]
            transmission = tokens[12]
            collection = tokens[13]
            jNum = tokens[14]
            createdBy = tokens[15]
        except IndexError:
            print('exiting with invalid line')
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        print('validating data and getting keys')

        # marker key
        markerKey = loadlib.verifyMarker(markerID, lineNum, fpErrorFile)

        # _vocab_key = 36 (Allele Molecular Mutation)
        # mutationType may hold several ';'-separated terms; keep the keys of
        # those that resolve (non-zero)
        mutationList = str.split(mutationType, ';')
        if len(mutationList) > 1:
            print('mutationList: %s' % mutationList)
        mutationKeyList = []
        for m in mutationList:
            mutationKey = loadlib.verifyTerm('', 36, m, lineNum, fpErrorFile)
            if mutationKey != 0:
                mutationKeyList.append(mutationKey)
        if len(mutationKeyList) > 1:
            print('mutationKeyList: %s' % mutationKeyList)

        # strains
        strainOfOriginKey = sourceloadlib.verifyStrain(strainOfOrigin, lineNum, fpErrorFile)

        # _vocab_key = 35 (Allele Inheritance Mode)
        inheritanceModeKey = loadlib.verifyTerm('', 35, inheritanceMode, lineNum, fpErrorFile)

        # _vocab_key = 38 (Allele Type)
        alleleTypeKey = loadlib.verifyTerm('', 38, alleleType, lineNum, fpErrorFile)

        # _vocab_key = 93 (Allele Subtype)
        # subtypes are optional: empty entries are ignored and an empty key
        # list does not cause the line to be skipped
        subTypeList = str.split(alleleSubType, ';')
        if len(subTypeList) > 1:
            print('subTypeList: %s' % subTypeList)
        subTypeKeyList = []
        for s in subTypeList:
            if s != '':     # if we have a subtype, get its key
                subTypeKey = loadlib.verifyTerm('', 93, s, lineNum, fpErrorFile)
                if subTypeKey != 0:
                    subTypeKeyList.append(subTypeKey)
        if len(subTypeKeyList) > 1:
            print('subTypeKeyList: %s' % subTypeKeyList)

        # _vocab_key = 37 (Allele Status)
        alleleStatusKey = loadlib.verifyTerm('', 37, alleleStatus, lineNum, fpErrorFile)

        # _vocab_key = 61 (Allele Transmission)
        transmissionKey = loadlib.verifyTerm('', 61, transmission, lineNum, fpErrorFile)

        # _vocab_key = 92
        collectionKey = loadlib.verifyTerm('', 92, collection, lineNum, fpErrorFile)

        # _vocab_key = 73 (Marker-Allele Association Status)
        # _term_key = 4268545 (Curated)
        markerStatusKey = 4268545

        # reference
        refKey = loadlib.verifyReference(jNum, lineNum, fpErrorFile)

        # creator
        createdByKey = loadlib.verifyUser(createdBy, lineNum, fpErrorFile)
        if createdByKey == 0:
            continue

        print('checking for missing data')

        # if errors, continue to next record
        # errors are stored (via loadlib) in the .error log
        if markerKey == 0 \
                or mutationKeyList == [] \
                or strainOfOriginKey == 0 \
                or inheritanceModeKey == 0 \
                or alleleTypeKey == 0 \
                or alleleStatusKey == 0 \
                or transmissionKey == 0 \
                or collectionKey == 0 \
                or refKey == 0 \
                or createdByKey == 0:
            print('missing data, skipping this line')
            continue

        # if no errors, process the allele
        print('writing to allele file')

        # allele (isWildType = 0)
        fpAlleleFile.write('%d|%s|%s|%s|%s|%s|%s|%s|%s|%s|0|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (alleleKey, markerKey, strainOfOriginKey, inheritanceModeKey, alleleTypeKey, \
            alleleStatusKey, transmissionKey, collectionKey, alleleSymbol, alleleName, \
            isExtinct, isMixed, refKey, markerStatusKey, \
            createdByKey, createdByKey, createdByKey, loaddate, loaddate, loaddate))

        # molecular mutation
        for mutationKey in mutationKeyList:
            fpMutationFile.write('%s|%s|%s|%s|%s\n' \
                % (alleleMutationKey, alleleKey, mutationKey, loaddate, loaddate))
            alleleMutationKey += 1

        # reference associations

        # Original
        fpRefFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (refAssocKey, refKey, alleleKey, mgiTypeKey, origRefTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))
        refAssocKey = refAssocKey + 1

        # Molecular
        fpRefFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (refAssocKey, refKey, alleleKey, mgiTypeKey, molRefTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))
        refAssocKey = refAssocKey + 1

        # allele subtype
        for subTypeKey in subTypeKeyList:
            fpAnnotFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                % (annotKey, annotTypeKey, alleleKey, subTypeKey, \
                qualifierKey, loaddate, loaddate))
            annotKey = annotKey + 1

        # MGI Accession ID for the allele
        alleleID = '%s%s' % (mgiPrefix, mgiKey)
        fpAccFile.write('%s|%s|%s|%s|1|%d|%d|0|1|%s|%s|%s|%s\n' \
            % (accKey, alleleID, mgiPrefix, mgiKey, alleleKey, mgiTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))

        # storing data in MGI_Note

        # molecular note
        fpNoteFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (noteKey, alleleKey, mgiTypeKey, molecularNoteTypeKey, description, \
            createdByKey, createdByKey, loaddate, loaddate))
        noteKey = noteKey + 1

        # colony ID note
        # Same nine columns as the molecular note.  The format used to have
        # only eight placeholders for these nine values, which raises
        # TypeError ("not all arguments converted").
        fpNoteFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (noteKey, alleleKey, mgiTypeKey, colonyIdNoteTypeKey, colonyID, \
            createdByKey, createdByKey, loaddate, loaddate))
        noteKey = noteKey + 1

        # Print out a new text file and attach the new MGI Allele IDs
        # as the first field
        fpNewAlleleRptFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
            % (mgi_utils.prvalue(alleleID), \
            mgi_utils.prvalue(alleleSymbol), \
            mgi_utils.prvalue(alleleName), \
            mgi_utils.prvalue(markerID), \
            mgi_utils.prvalue(markerSymbol), \
            mgi_utils.prvalue(colonyID)))

        accKey = accKey + 1
        mgiKey = mgiKey + 1
        alleleKey = alleleKey + 1

    #
    # Update the AccessionMax value
    #
    # ACC_setMax is given the number of input lines read, not the number of
    # alleles actually written.
    print('DEBUG: %s' % DEBUG)
    if DEBUG == 'false':
        db.sql('select * from ACC_setMax(%d)' % (lineNum), None)
        db.commit()

    return 0
def processFile():
    '''
    # requires:
    #   module-level inputFile, outputFile, errorFile, db, mgi_utils,
    #   parseType, delim and createdBy (all defined outside this function)
    #
    # effects:
    #   Reads input file
    #   Writes output file: one line per input line whose "good" name exists
    #   in the database and differs from the "bad" name
    #   Writes to the error file: lines with too few columns and "good"
    #   names that are not found
    #
    # returns:
    #   nothing
    '''

    # For each line in the input file
    for line in inputFile.readlines():

        # lines flagged with a leading '!' are skipped
        if line[0] == '!':
            continue

        # strip only the line terminator, so an unterminated last line does
        # not lose its final character
        tokens = line.rstrip('\n').split(delim)

        # the bad name is in column 1 for 'Library', column 2 otherwise;
        # the good name is always in column 3
        try:
            if parseType == 'Library':
                badName = tokens[0]
            else:
                badName = tokens[1]
            goodName = tokens[2]
        except IndexError:
            errorFile.write('Invalid line: %s\n' % (line))
            continue

        # NOTE(review): goodName is interpolated into the SQL text as-is; a
        # name containing a double quote would break the statement.
        if parseType == 'Tissues':
            results = db.sql(
                'select _Tissue_key from PRB_Tissue where tissue = "%s"' % (goodName), 'auto')

        elif parseType == 'Cell':
            results = db.sql(
                'select term from VOC_Term where term = "%s"' % (goodName), 'auto')

        elif parseType == 'Library':
            results = db.sql(
                'select _Source_key from PRB_Source where name = "%s"' % (goodName), 'auto')

        elif parseType == 'Strains':
            results = db.sql('select a.accID from PRB_Strain_Acc_View a, PRB_Strain s ' + \
                'where s.strain = "%s" ' % (goodName) + \
                'and s._Strain_key *= a._Object_key ' + \
                'and a._LogicalDB_key = 1 ' + \
                'and a.prefixPart = "MGI:" ' + \
                'and a.preferred = 1', 'auto')

        if len(results) > 0 and badName != goodName:
            # Only the 'Strains' query selects accID.  This test used to
            # compare against 'strain', which no query branch uses, so the
            # strain's MGI ID was never written.
            if parseType == 'Strains':
                outputFile.write(
                    mgi_utils.prvalue(results[0]['accID']) + delim + \
                    goodName + delim + badName + delim + createdBy + '\n')
            else:
                outputFile.write(delim + goodName + delim + badName + delim + createdBy + '\n')

        elif len(results) == 0:
            errorFile.write('Invalid good name: %s\n' % (goodName))
# column 7 if genomicToTranscript.has_key(genomicID): fp.write(string.join(genomicToTranscript[genomicID], ' ')) fp.write(TAB) # column 8 if genomicToProtein.has_key(genomicID): fp.write(string.join(genomicToProtein[genomicID], ' ')) fp.write(TAB) # column 9: feature types if featureTypes.has_key(r['_Marker_key']): fp.write(string.join(featureTypes[r['_Marker_key']], '|')) fp.write(TAB) # column 10-11-12 if coords.has_key(key): fp.write(mgi_utils.prvalue(coords[r['_Marker_key']][0]['startC']) + TAB) fp.write(mgi_utils.prvalue(coords[r['_Marker_key']][0]['endC']) + TAB) fp.write(mgi_utils.prvalue(coords[r['_Marker_key']][0]['strand']) + TAB) else: fp.write(TAB + TAB + TAB) # column 13: biotypes if bioTypes.has_key(r['_Marker_key']): fp.write(string.join(bioTypes[r['_Marker_key']], '|')) fp.write(CRT) reportlib.finish_nonps(fp)
def processFile():
    """Read the primer input file and write the primer bcp files.

    Each tab-delimited input line supplies 12 columns: marker symbol,
    '|'-separated marker IDs, name, J number, region covered, sequence 1,
    sequence 2, product size, notes, '|'-separated sequence IDs,
    '|'-separated aliases and creator.

    For every line whose marker IDs all resolve, records are written for the
    primer, its marker associations, its reference, its aliases, its new MGI
    accession ID, its sequence accession IDs (with accession references) and
    its note; the line is echoed to newPrimerFile with the new MGI ID
    appended.  The global key counters advance accordingly.

    A line with fewer than 12 columns is reported through exit(1, ...).
    NOTE(review): the reference and creator keys are not checked for 0 before
    being written - confirm that loadlib stops the load in that case.

    Uses names defined outside this function: the file descriptors, loadlib,
    accessionlib, mgi_utils, db, DEBUG, NA, vectorKey, segmentTypeKey,
    relationship, mgiPrefix, mgiTypeKey, logicalDBKey and loaddate.
    """

    global primerKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():

        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        # only a short line can fail here, so catch exactly that
        try:
            markerSymbol = tokens[0]    # only echoed to newPrimerFile
            markerIDs = tokens[1].split('|')
            name = tokens[2]
            jnum = tokens[3]
            regionCovered = tokens[4]
            sequence1 = tokens[5]
            sequence2 = tokens[6]
            productSize = tokens[7]
            notes = tokens[8]
            sequenceIDs = tokens[9]
            aliasList = tokens[10].split('|')
            createdBy = tokens[11]
        except IndexError:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # marker IDs
        markerList = []
        markerIDByKey = {}      # marker key -> marker ID, for error messages
        for markerID in markerIDs:

            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker: %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)
                markerIDByKey[markerKey] = markerID

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        # sequence IDs
        seqAccList = sequenceIDs.split('|')

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process the primer

        primerFile.write('%d\t%s\t\t%d\t%d\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t%s\t%s\n' \
            % (primerKey, name, NA, vectorKey, segmentTypeKey, mgi_utils.prvalue(sequence1), \
            mgi_utils.prvalue(sequence2), mgi_utils.prvalue(regionCovered), mgi_utils.prvalue(productSize), \
            createdByKey, createdByKey, loaddate, loaddate))

        # a marker listed more than once is reported and not associated at all
        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (primerKey, markerKey, referenceKey, relationship, \
                    createdByKey, createdByKey, loaddate, loaddate))
            else:
                # Report the ID of the duplicated marker.  This used to print
                # the leftover loop variable from the marker-ID loop above,
                # i.e. always the last ID on the line.
                errorFile.write('Invalid Marker Duplicate: %s, %s\n' \
                    % (name, markerIDByKey[markerKey]))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' \
            % (refKey, primerKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))

        # aliases
        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the primer
        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, primerKey, mgiTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))

        newPrimerFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (markerSymbol, '|'.join(markerIDs), name, jnum, regionCovered, \
            sequence1, sequence2, productSize, notes, sequenceIDs, createdBy, \
            mgiPrefix, mgiKey))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc in seqAccList:

            if len(acc) == 0:
                continue

            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, logicalDBKey, primerKey, mgiTypeKey, \
                createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        # notes
        if len(notes) > 0:
            noteFile.write('%s|1\t%s\t%s\t%s\n' \
                % (primerKey, notes, loaddate, loaddate))

        refKey = refKey + 1
        primerKey = primerKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    # ACC_setMax is given the number of input lines read.
    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
def processFileIKMC(createMCL, createNote, setStatus, \
        symbol, ikmcSymbol, mutantCellLine, ikmcNotes, createdByKey, existingAlleleID):
    '''
    # Handle one IKMC-only input row: optionally attach a mutant cell line,
    # queue an allele status update, add or extend the IKMC colony note, and
    # write one line to newAlleleFile.
    #
    # createMCL        : '' = nothing to do; a value for which int() is 0 =
    #                    use the allele key cached in alleleLookup[symbol];
    #                    anything else is used as the allele key itself
    # createNote       : '' = nothing to do; otherwise 'alleleKey::note' or
    #                    'noteKey||existing colony notes' (see below)
    # setStatus        : '' = nothing to do; otherwise the allele key whose
    #                    _Allele_Status_key is set to 847114
    # symbol           : key into alleleLookup
    # ikmcSymbol       : written to newAlleleFile
    # mutantCellLine   : passed to addMutantCellLine
    # ikmcNotes        : colony note text to add; written to newAlleleFile
    # createdByKey     : user key stamped on new records
    # existingAlleleID : if non-empty, the allele ID written to newAlleleFile
    #
    # SQL statements are appended to the global ikmcSQLs rather than run
    # here.  alleleLookup[symbol][0] is read as the tuple
    # (allele key, note key, allele id).
    '''

    global noteKey, ikmcSQLs

    #
    # add new MCLs to new/existing alleles
    #

    if len(createMCL) > 0:

        if DEBUG:
            print symbol, createMCL

        # 0 means "the allele was created earlier in this run": take its key
        # from the lookup
        if int(createMCL) == 0:
            aKey = alleleLookup[symbol][0][0]
        else:
            aKey = createMCL

        addMutantCellLine(aKey, mutantCellLine, createdByKey)

    #
    # set allele/status = Approved for existing "reserved" alleles
    #

    if len(setStatus) > 0:
        ikmcSQLs.append('update ALL_Allele set _Allele_Status_key = 847114 where _Allele_key = %s' % (setStatus))

    #
    # Add IKMC Colony/Note to a new or existing allele
    #
    # child exists/ikmc note exists : update existing note
    #       || => _Note_key||existing colony notes
    #
    # child exists/ikmc note does not exist : add note
    #       :: => allele/child key
    #
    # new allele/child/non-duplicate IKMC Colony
    #       0::colony(s)
    #
    # blank => do nothing
    #

    if len(createNote) > 0:

        if DEBUG:
            print 'createNote: ', symbol

        # The try body handles the '::' form.  For the '||' form the split
        # on '::' leaves the whole string in tokens[0], int() raises, and
        # control moves to the except branch.
        # NOTE(review): the bare except also catches any other error raised
        # in the try body (e.g. a missing alleleLookup entry) and then treats
        # the value as the '||' form - confirm this is intended.
        try:
            tokens = createNote.split('::')
            aKey = tokens[0]

            # duplicate child, additional note : add note to new child
            if int(aKey) == 0:
                nKey = alleleLookup[symbol][0][1]
                note = tokens[1]
                ikmcSQLs.append('''update MGI_NoteChunk set note = '%s' where _Note_key = %s;''' % (note, nKey))

            # child exists, note does not exist : add note to existing child
            else:
                aKey = tokens[0]
                note = ikmcNotes

                # a note was already created for this symbol in this run:
                # append to it instead of creating a second note
                if alleleLookup.has_key(symbol):
                    nKey = alleleLookup[symbol][0][1]
                    ikmcSQLs.append('''update MGI_NoteChunk set note = rtrim(note) || '|%s' where _Note_key = %s;''' % (note, nKey))
                else:
                    noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' \
                        % (noteKey, aKey, mgiNoteObjectKey, mgiIKMCNoteTypeKey, \
                        createdByKey, createdByKey, loaddate, loaddate))

                    noteChunkFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                        % (noteKey, 1, note, createdByKey, createdByKey, loaddate, loaddate))

                    # save symbol/aKey/ikmc note key/allele id
                    alleleLookup[symbol] = []
                    alleleLookup[symbol].append((aKey, noteKey, 'missing allele id (1)'))

                    noteKey = noteKey + 1

        # child exists, note exists : update existing note
        except:
            if DEBUG:
                print createNote

            tokens = createNote.split('||')
            nKey = tokens[0]
            # existing colony notes followed by the new one
            note = tokens[1] + '|' + ikmcNotes
            ikmcSQLs.append('''update MGI_NoteChunk set note = '%s' where _Note_key = %s;''' % (note, nKey))

    #
    # print out the proper allele id
    #
    # preference: the ID supplied by the caller, then the one cached in
    # alleleLookup, then a placeholder
    if len(existingAlleleID) > 0:
        printAlleleID = existingAlleleID
    elif alleleLookup.has_key(symbol):
        printAlleleID = alleleLookup[symbol][0][2]
    else:
        printAlleleID = 'missing allele id (2)'

    newAlleleFile.write('%s\t%s\t%s\n' \
        % (mgi_utils.prvalue(ikmcNotes), \
        mgi_utils.prvalue(printAlleleID), \
        mgi_utils.prvalue(ikmcSymbol)))
def processFile():
    '''
    # requires:
    #	inputFile: tab-delimited, at least 24 fields per line
    #
    # effects:
    #	For each input line either delegates to processFileIKMC
    #	(when createMCL/createNote/setStatus is set) or verifies the
    #	values and writes the allele, mutation, reference, annotation,
    #	accession and note BCP records plus one row in newAlleleFile.
    #	Rows that fail verification are skipped.
    #
    # returns:
    #	nothing
    #
    '''

    global alleleKey, refAssocKey, accKey, noteKey, mgiKey, annotKey, mutationKey
    global alleleLookup

    # reference association type key for each reference type used in the input
    refAssocTypeKeys = {'Original': 1011, 'Transmission': 1023, 'Molecular': 1012}

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():

        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            (markerID, symbol, name, alleleStatus, alleleType, alleleSubtypes,
             collectionKey, germLine, references, strainOfOrigin, mutantCellLine,
             molecularNotes, driverNotes, ikmcNotes, mutations, inheritanceMode,
             isMixed, isExtinct, createdBy, createMCL, createNote, setStatus,
             existingAlleleID, ikmcSymbol) = tokens[:24]
        except ValueError:
            # fewer than 24 fields
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # creator
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)
        if createdByKey == 0:
            continue

        # processing for IKMC-only
        if len(createMCL) > 0 or len(createNote) > 0 or len(setStatus) > 0:
            processFileIKMC(createMCL, createNote, setStatus, \
                symbol, ikmcSymbol, mutantCellLine, ikmcNotes, \
                createdByKey, existingAlleleID)
            continue

        # marker key
        markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

        # hard-coded
        # _vocab_key = 73 (Marker-Allele Association Status)
        # _term_key = 4268545 (Curated)
        markerStatusKey = 4268545

        # _vocab_key = 37 (Allele Status)
        alleleStatusKey = loadlib.verifyTerm('', 37, alleleStatus, lineNum, errorFile)

        # _vocab_key = 38 (Allele Type)
        alleleTypeKey = loadlib.verifyTerm('', 38, alleleType, lineNum, errorFile)

        # _vocab_key = 61 (Allele Transmission)
        germLineKey = loadlib.verifyTerm('', 61, germLine, lineNum, errorFile)

        # _vocab_key = 36 (Allele Molecular Mutation)
        allMutations = mutations.split('|')

        # _vocab_key = 35
        inheritanceModeKey = loadlib.verifyTerm('', 35, inheritanceMode, lineNum, errorFile)

        # strains
        strainOfOriginKey = sourceloadlib.verifyStrain(strainOfOrigin, lineNum, errorFile)

        # reference
        # NOTE(review): jnum is not assigned anywhere in this function;
        # presumably a module-level value -- confirm.
        refKey = loadlib.verifyReference(jnum, lineNum, errorFile)

        # if errors, continue to next record
        # errors are stored (via loadlib) in the .error log
        # (allMutations is a list and can never equal 0, so it is not tested)
        if 0 in (markerKey, markerStatusKey, alleleStatusKey, alleleTypeKey, \
                 germLineKey, inheritanceModeKey, strainOfOriginKey, refKey, \
                 createdByKey):
            continue

        # if no errors, process the allele

        # allele (master)
        alleleFile.write('%d|%s|%s|%s|%s|%s|%s|%s|%s|%s|0|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (alleleKey, markerKey, strainOfOriginKey, inheritanceModeKey, alleleTypeKey, \
            alleleStatusKey, germLineKey, collectionKey, symbol, name, \
            isExtinct, isMixed, refKey, markerStatusKey, \
            createdByKey, createdByKey, createdByKey, loaddate, loaddate, loaddate))

        # molecular mutation
        for mutation in allMutations:
            mutationTermKey = loadlib.verifyTerm('', 36, mutation, lineNum, errorFile)
            mutationFile.write('%s|%s|%s|%s|%s\n' \
                % (mutationKey, alleleKey, mutationTermKey, loaddate, loaddate))
            mutationKey = mutationKey + 1

        #
        # allele references
        #
        for reference in references.split('||'):
            refType, refID = reference.split('|')
            refKey = loadlib.verifyReference(refID, lineNum, errorFile)

            # An unknown type used to reuse the previous reference's type
            # (or fail on the first one); report it and skip instead.
            if refType not in refAssocTypeKeys:
                errorFile.write('Invalid Reference Type (%d): %s\n' % (lineNum, refType))
                continue
            refAssocTypeKey = refAssocTypeKeys[refType]

            refFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (refAssocKey, refKey, alleleKey, mgiTypeKey, refAssocTypeKey, \
                createdByKey, createdByKey, loaddate, loaddate))
            refAssocKey = refAssocKey + 1

        #
        # allele subtypes
        #
        for s in alleleSubtypes.split('|'):
            # _vocab_key = 93 (Allele Subtype)
            alleleSubtypeKey = loadlib.verifyTerm('', 93, s, lineNum, errorFile)

            annotFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                % (annotKey, annotTypeKey, alleleKey, alleleSubtypeKey, \
                qualifierKey, loaddate, loaddate))
            annotKey = annotKey + 1

        #
        # mutant cell line
        #
        if len(mutantCellLine) > 0:
            addMutantCellLine(alleleKey, mutantCellLine, createdByKey)

        # MGI Accession ID for the allele
        accFile.write('%s|%s%d|%s|%s|1|%d|%d|0|1|%s|%s|%s|%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, alleleKey, mgiTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))

        # storing data in MGI_Note

        # molecular notes
        if len(molecularNotes) > 0:
            noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, alleleKey, mgiNoteObjectKey, mgiMolecularNoteTypeKey, \
                molecularNotes, createdByKey, createdByKey, loaddate, loaddate))
            noteKey = noteKey + 1

        # driver notes
        # TR12662/MGI_Relationship._Category_key = 1006
        # removed noteFile code
        # place holder for MGI_Relationship code
        # the IKMC is the only product using this and IKMC does not add any driver note

        # ikmc notes
        useIKMCnotekey = 0
        if len(ikmcNotes) > 0:
            # nine fields, same layout as the molecular note above
            noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, alleleKey, mgiNoteObjectKey, mgiIKMCNoteTypeKey, \
                ikmcNotes, createdByKey, createdByKey, loaddate, loaddate))
            useIKMCnotekey = noteKey
            noteKey = noteKey + 1

        # Print out a new text file and attach the new MGI Allele IDs as the last field

        if createdBy == 'ikmc_alleleload':
            newAlleleFile.write('%s\t%s%s\t%s\n' \
                % (mgi_utils.prvalue(ikmcNotes), \
                mgi_utils.prvalue(mgiPrefix), mgi_utils.prvalue(mgiKey), \
                mgi_utils.prvalue(ikmcSymbol)))
        else:
            # One column per value, then the new MGI ID; joining keeps the
            # number of columns and the number of values in step.
            reportFields = [markerID, symbol, name, alleleStatus, alleleType, \
                alleleSubtypes, collectionKey, germLine, references, strainOfOrigin, \
                mutantCellLine, mutations, inheritanceMode, isMixed, isExtinct, \
                refKey, markerStatusKey, createdBy]
            newAlleleFile.write( \
                '\t'.join(['%s' % (mgi_utils.prvalue(f),) for f in reportFields]) \
                + '\t%s%s\n' % (mgi_utils.prvalue(mgiPrefix), mgi_utils.prvalue(mgiKey)))

        # save symbol/alleleKey/ikmc note key
        alleleLookup[symbol] = []
        alleleLookup[symbol].append((alleleKey, useIKMCnotekey, mgiPrefix + str(mgiKey)))

        accKey = accKey + 1
        mgiKey = mgiKey + 1
        alleleKey = alleleKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    if not DEBUG:
        db.sql('select * from ACC_setMax(%d)' % (lineNum), None)
        db.commit()
and ma._LogicalDB_key = 1 and ma.preferred = 1 order by p.accID ''', 'auto') prevProbe = 0 markers = [] for r in results: if prevProbe != r['_Probe_key']: if len(markers) > 0: fp.write(string.join(markers, ',')) markers = '' if prevProbe > 0: fp.write(reportlib.CRT) fp.write(mgi_utils.prvalue(r['accID']) + reportlib.TAB) fp.write(mgi_utils.prvalue(r['name']) + reportlib.TAB) prevProbe = r['_Probe_key'] markers = [] markers.append(r['markerID']) fp.write(string.join(markers, ',')) fp.write(reportlib.CRT) reportlib.finish_nonps(fp)
def process(mode):
    # Purpose: process data using either 'sql' or 'bcp' mode
    #
    # 'bcp' : every cache row is written to
    #         $ALLCACHEBCPDIR/ALL_Cre_Cache.bcp, keys start at 1
    # other : deleteSQL is executed first; in 'sql' mode rows are inserted
    #         with insertSQL1/insertSQL2, keys continue after the current
    #         max(_Cache_key)
    #
    # Returns: nothing

    db.sql('create index idx1 on toprocess1(_Allele_key)', None)
    db.sql('create index idx2 on toprocess2(_Allele_key)', None)

    outBCP = None
    if mode == 'bcp':
        outBCP = open(os.environ['ALLCACHEBCPDIR'] + '/ALL_Cre_Cache.bcp', 'w')
    else:
        db.sql(deleteSQL, None)
        db.commit()

    # try/finally so the bcp file is closed even when a query or write fails
    try:
        #
        # next available primary key
        #
        nextMaxKey = 0
        if mode == 'sql':
            results = db.sql('select max(_Cache_key) as cacheKey from ALL_Cre_Cache', 'auto')
            for r in results:
                nextMaxKey = r['cacheKey']
                # max() of an empty table is null
                if nextMaxKey is None:
                    nextMaxKey = 0
        nextMaxKey = nextMaxKey + 1

        results = db.sql('select * from toprocess1', 'auto')
        for r in results:

            creSystemsList = processCreSystems(r['_EMAPA_Term_key'], r['emapaTerm'], r['_Stage_key'])

            # one cache row per cre-system label
            for printCreLabel in creSystemsList:

                if mode == 'sql':
                    db.sql(insertSQL1 % (str(nextMaxKey),
                        r['_Allele_key'],
                        r['_Allele_Type_key'],
                        r['_EMAPA_Term_key'],
                        r['_Stage_key'],
                        r['_Assay_key'],
                        r['accID'],
                        r['symbol'],
                        r['name'],
                        r['alleleType'],
                        r['driverGene'],
                        r['emapaTerm'],
                        r['age'],
                        r['ageMin'],
                        r['ageMax'],
                        r['expressed'],
                        r['hasImage'],
                        printCreLabel,
                        userKey, userKey), None)
                else:
                    outBCP.write(COLDL.join([
                        str(nextMaxKey),
                        mgi_utils.prvalue(r['_Allele_key']),
                        mgi_utils.prvalue(r['_Allele_Type_key']),
                        mgi_utils.prvalue(r['_EMAPA_Term_key']),
                        mgi_utils.prvalue(r['_Stage_key']),
                        mgi_utils.prvalue(r['_Assay_key']),
                        mgi_utils.prvalue(r['accID']),
                        mgi_utils.prvalue(r['symbol']),
                        mgi_utils.prvalue(r['name']),
                        mgi_utils.prvalue(r['alleleType']),
                        mgi_utils.prvalue(r['driverGene']),
                        mgi_utils.prvalue(r['emapaTerm']),
                        mgi_utils.prvalue(r['age']),
                        mgi_utils.prvalue(r['ageMin']),
                        mgi_utils.prvalue(r['ageMax']),
                        mgi_utils.prvalue(r['expressed']),
                        mgi_utils.prvalue(r['hasImage']),
                        mgi_utils.prvalue(printCreLabel),
                        mgi_utils.prvalue(userKey),
                        mgi_utils.prvalue(userKey),
                        loaddate,
                        loaddate]) + LINEDL)

                nextMaxKey = nextMaxKey + 1

        #
        # select the remaining Cre data (those alleles without genotypes/structures)
        # cre-system is always empty (null)
        #
        if isQuerySQL2 == 1:

            results = db.sql('select * from toprocess2', 'auto')
            for r in results:

                # the key is advanced before the row is written here, so one
                # key value is left unused between the two result sets
                nextMaxKey = nextMaxKey + 1

                if mode == 'sql':
                    db.sql(insertSQL2 % (str(nextMaxKey),
                        r['_Allele_key'],
                        r['_Allele_Type_key'],
                        r['accID'],
                        r['symbol'],
                        r['name'],
                        r['alleleType'],
                        r['driverGene'],
                        userKey, userKey), None)
                else:
                    empty = mgi_utils.prvalue('')
                    outBCP.write(COLDL.join([
                        str(nextMaxKey),
                        mgi_utils.prvalue(r['_Allele_key']),
                        mgi_utils.prvalue(r['_Allele_Type_key']),
                        empty, empty, empty,
                        mgi_utils.prvalue(r['accID']),
                        mgi_utils.prvalue(r['symbol']),
                        mgi_utils.prvalue(r['name']),
                        mgi_utils.prvalue(r['alleleType']),
                        mgi_utils.prvalue(r['driverGene']),
                        empty, empty, empty, empty, empty, empty, empty,
                        mgi_utils.prvalue(userKey),
                        mgi_utils.prvalue(userKey),
                        loaddate,
                        loaddate]) + LINEDL)

    finally:
        if outBCP is not None:
            outBCP.close()
def processFileIKMC(createMCL, createNote, setStatus, \
        symbol, ikmcSymbol, mutantCellLine, ikmcNotes, createdByKey, existingAlleleID):
    '''
    # requires:
    #	createMCL: '' (do nothing), '0' (use the allele stored in
    #		alleleLookup for symbol) or an allele key
    #	createNote: '' (do nothing), '<_Note_key>||<existing colony notes>',
    #		'<allele key>::' or '0::<colony(s)>'
    #	setStatus: '' (do nothing) or the key of the allele to approve
    #
    # effects:
    #	Handles an IKMC-only input row: may add a mutant cell line,
    #	queue SQL updates in ikmcSQLs, write an MGI_Note record and
    #	always writes one row to newAlleleFile.
    #
    # returns:
    #	nothing
    #
    '''

    global noteKey, ikmcSQLs

    #
    # add new MCLs to new/existing alleles
    #
    if len(createMCL) > 0:

        if DEBUG:
            print(symbol, createMCL)

        if int(createMCL) == 0:
            aKey = alleleLookup[symbol][0][0]
        else:
            aKey = createMCL

        addMutantCellLine(aKey, mutantCellLine, createdByKey)

    #
    # set allele/status = Approved for existing "reserved" alleles
    #
    if len(setStatus) > 0:
        ikmcSQLs.append(
            'update ALL_Allele set _Allele_Status_key = 847114 where _Allele_key = %s' % (setStatus))

    #
    # Add IKMC Colony/Note to a new or existing allele
    #
    # child exists/ikmc note exists : update existing note
    #	|| => _Note_key||existing colony notes
    #
    # child exists/ikmc note does not exist : add note
    #	:: => allele/child key
    #
    # new allele/child/non-duplicate IKMC Colony
    #	0::colony(s)
    #
    # blank => do nothing
    #
    if len(createNote) > 0:

        if DEBUG:
            print('createNote: ', symbol)

        tokens = createNote.split('::')

        # Only the integer parse decides which format we were given; any
        # other failure (unknown symbol, file error) must not be mistaken
        # for the '||' format, so nothing else lives inside the try.
        try:
            isNewChild = int(tokens[0]) == 0

        # child exists, note exists : update existing note
        except ValueError:
            if DEBUG:
                print(createNote)

            tokens = createNote.split('||')
            nKey = tokens[0]
            note = tokens[1] + '|' + ikmcNotes
            ikmcSQLs.append(
                '''update MGI_Note set note = '%s' where _Note_key = %s;''' % (note, nKey))

        else:
            # duplicate child, additional note : add note to new child
            if isNewChild:
                nKey = alleleLookup[symbol][0][1]
                note = tokens[1]
                ikmcSQLs.append(
                    '''update MGI_Note set note = '%s' where _Note_key = %s;''' % (note, nKey))

            # child exists, note does not exist : add note to existing child
            else:
                aKey = tokens[0]
                note = ikmcNotes

                if symbol in alleleLookup:
                    nKey = alleleLookup[symbol][0][1]
                    ikmcSQLs.append(
                        '''update MGI_Note set note = rtrim(note) || '|%s' where _Note_key = %s;''' % (note, nKey))
                else:
                    noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
                        % (noteKey, aKey, mgiNoteObjectKey, mgiIKMCNoteTypeKey, \
                        note, createdByKey, createdByKey, loaddate, loaddate))

                    # save symbol/aKey/ikmc note key/allele id
                    alleleLookup[symbol] = []
                    alleleLookup[symbol].append(
                        (aKey, noteKey, 'missing allele id (1)'))

                    noteKey = noteKey + 1

    #
    # print out the proper allele id
    #
    if len(existingAlleleID) > 0:
        printAlleleID = existingAlleleID
    elif symbol in alleleLookup:
        printAlleleID = alleleLookup[symbol][0][2]
    else:
        printAlleleID = 'missing allele id (2)'

    newAlleleFile.write('%s\t%s\t%s\n' \
        % (mgi_utils.prvalue(ikmcNotes), \
        mgi_utils.prvalue(printAlleleID), \
        mgi_utils.prvalue(ikmcSymbol)))
key = r['mgiID'] value = r['symbol'] # print one row of the marker record if not markerList.has_key(key): fp.write(r['mgiID'] + TAB) fp.write(r['symbol'] + TAB) fp.write('MGI' + TAB) fp.write(r['mgiID'] + TAB) fp.write(r['featureType'] + TAB) fp.write(CRT) markerList[key] = value # print row of the gene model sequence fp.write(r['mgiID'] + TAB) fp.write(r['symbol'] + TAB) fp.write(r['provider'] + TAB) fp.write(r['accID'] + TAB) fp.write(mgi_utils.prvalue(r['rawbiotype']) + TAB) if r['_Qualifier_key'] == 615419: fp.write('Representative') fp.write(CRT) fp.write(CRT + '(%d genes affected)' % (len(markerList)) + CRT) db.useOneConnection(0) reportlib.finish_nonps(fp) # non-postscript file
def doGAFFinish():
    #
    # Purpose: write one GAF row to fp for every record of gomarker2 whose
    # term has a DAG abbreviation with a known qualifier; rows whose
    # object/evidence key is in forPROC are also written to fp2 (TR11060).
    #
    # Output format:
    #
    # The GO format has the following columns:
    #
    #   1.  Database designation (MGI)
    #   2.  MGI Marker ID (MGI:xxxx)
    #   3.  Symbol
    #   4.  Qualifier
    #   5.  GO id
    #   6.  MGI ID of Reference (MGI:MGI:xxxx|PMID:xxxx)
    #   7.  Evidence abbreviation
    #   8.  Inferred From
    #   9.  GO DAG Abbreviation (F, P, C)
    #   10. Gene name
    #   11. Gene synonym(s) - list of |-delimited synonyms
    #   12. Marker Type or Protein (gene)
    #   13. Species (taxon:10090)
    #   14. Modification Date (YYYYMMDD)
    #   15. Assigned By
    #   16. Properties/Values (occurs_in, part_of, etc.)
    #   17. Isoform
    #

    results = db.sql('select * from gomarker2 order by symbol, termID', 'auto')

    for r in results:

        # skip terms without a DAG, or with a DAG that has no qualifier
        if r['_Term_key'] not in dag:
            continue

        if dag[r['_Term_key']] not in dagQualifier:
            continue

        # lookups that are per-annotation are keyed by "object:evidence"
        objectKey = str(r['_Object_key']) + ':' + str(r['_AnnotEvidence_key'])

        columns = []

        # columns 1-5
        columns.append(MGIPREFIX)
        columns.append(str(r['markerID']))
        columns.append(r['symbol'])

        if r['qualifier'] is not None:
            columns.append(r['qualifier'].strip())
        else:
            columns.append('')

        columns.append(r['termID'])

        # column 6; reference
        # MGI id first, then the PubMed id or, failing that, the GO_REF id
        references = [MGIPREFIX + ':' + r['refID']]
        if r['_Refs_key'] in pubMed:
            references.append('PMID:' + pubMed[r['_Refs_key']])
        elif r['_Refs_key'] in goRefDict:
            references.append(goRefDict[r['_Refs_key']])
        columns.append('|'.join(references))

        # column 7
        columns.append(r['evidenceCode'])

        # column 8
        columns.append(mgi_utils.prvalue(r['inferredFrom']).replace('MGI:', 'MGI:MGI:'))

        # column 9-10
        columns.append(dag[r['_Term_key']])
        columns.append(r['name'])

        # column 11
        if r['_Object_key'] in syns:
            columns.append('|'.join(syns[r['_Object_key']]))
        else:
            columns.append('')

        # column 12
        # if marker is associated with an isoform (via go/annotation)
        # or marker is associated with a protein (via marker/sequence cache)
        # print 'protein'
        # else, print marker type (ex. 'gene')
        if objectKey in isoformsProtein or r['_Object_key'] in proteins:
            columns.append('protein')
        else:
            columns.append(r['markerType'])

        # column 13
        columns.append(SPECIES)

        # column 14
        columns.append(str(r['mDate']))

        # column 15; assigned by
        # remove "NOCTUA_"; for example: "NOCTUA_MGI" ==> "MGI"
        # remove "GOA_"; for example: "GOA_IntAct" ==> "IntAct"
        assignedBy = r['assignedBy']
        if 'NOCTUA_' in assignedBy:
            columns.append(assignedBy.replace('NOCTUA_', ''))
        elif 'GOA_' in assignedBy:
            columns.append(assignedBy.replace('GOA_', ''))
        elif assignedBy in assignedByList1:
            columns.append('UniProt')
        elif assignedBy in assignedByList2:
            columns.append(assignedBy)
        # else use default (MGIPREFIX)
        else:
            columns.append(MGIPREFIX)

        #
        # column 16
        # contains property/value information
        # see lib_py_report/go_annot_extensions.py for list of excluded properties
        if objectKey in gafCol16Lookup:
            columns.append(''.join(gafCol16Lookup[objectKey]))
        else:
            columns.append('')

        # column 17
        # if isoformProtein = true
        #    then use isoformsProtein
        if objectKey in isoformsProtein:
            columns.append('|'.join(isoformsProtein[objectKey]))
        else:
            columns.append('')

        reportRow = TAB.join(columns) + CRT
        fp.write(reportRow)

        #
        # TR11060
        # subset of UniProtKB:xxxx-?? only
        #
        if objectKey in forPROC:
            fp2.write(reportRow)
def addGPADReportRow(reportRow, r):
    #
    # Purpose: append GPAD columns 3-12 for annotation record r to reportRow
    #
    # reportRow: the row text built so far; columns 3-12 are appended to it
    # r: one annotation record (dictionary of column name -> value)
    #
    # Returns: reportRow followed by the ten tab-delimited columns and CRT
    #

    # gafCol16Lookup is keyed by "object:evidence"; the gpad/evidence/taxon
    # lookups are keyed by the evidence key alone
    objectKey = str(r['_Object_key']) + ':' + str(r['_AnnotEvidence_key'])
    key = r['_AnnotEvidence_key']

    columns = []

    # 3. Qualifier
    # use gpadCol3 or DAG
    if key in gpadCol3Lookup:
        default_relation_for_aspect = '|'.join(gpadCol3Lookup[key])
    elif r['inferredFrom'] is not None and 'InterPro:' in r['inferredFrom'] and dag[r['_Term_key']] == 'P':
        default_relation_for_aspect = 'involved_in'
    else:
        default_relation_for_aspect = dagQualifier[dag[r['_Term_key']]]

    # qualifier from MGD annotations
    if r['qualifier'] is not None:
        qualifier = r['qualifier'].strip()
    else:
        qualifier = ''

    if qualifier == '':
        columns.append(default_relation_for_aspect)
    elif qualifier == 'NOT':
        columns.append(qualifier + '|' + default_relation_for_aspect)
    else:
        columns.append(qualifier)

    # 4. GO ID
    columns.append(r['termID'])

    # 5. DB:Reference(s)
    # MGI id first, then the PubMed id or, failing that, the GO_REF id
    references = [MGIPREFIX + ':' + r['refID']]
    if r['_Refs_key'] in pubMed:
        references.append('PMID:' + pubMed[r['_Refs_key']])
    elif r['_Refs_key'] in goRefDict:
        references.append(goRefDict[r['_Refs_key']])
    columns.append('|'.join(references))

    # 6. Evidence Code
    if key in evidenceLookup:
        columns.append(evidenceLookup[key][0])
    elif r['evidenceCode'] in ecoLookupByEvidence:
        columns.append(ecoLookupByEvidence[r['evidenceCode']])
    else:
        columns.append('NOT FOUND')

    # 7. With (or)From
    inferredFrom = mgi_utils.prvalue(r['inferredFrom']).replace('MGI:', 'MGI:MGI:')
    columns.append(mgi_utils.prvalue(inferredFrom))

    # 8. Interacting taxon ID
    if key in taxonLookup:
        columns.append(taxonLookup[key][0])
    else:
        columns.append('')

    # 9. Date
    columns.append(str(r['mDate']))

    # 10. Assigned by
    # remove "NOCTUA_"; for example: "NOCTUA_MGI" ==> "MGI"
    # remove "GOA_"; for example: "GOA_IntAct" ==> "IntAct"
    assignedBy = r['assignedBy']
    if 'NOCTUA_' in assignedBy:
        columns.append(assignedBy.replace('NOCTUA_', ''))
    elif 'GOA_' in assignedBy:
        columns.append(assignedBy.replace('GOA_', ''))
    elif assignedBy in assignedByList1:
        columns.append('UniProt')
    elif assignedBy in assignedByList2:
        columns.append(assignedBy)
    # else use default (MGIPREFIX)
    else:
        columns.append(MGIPREFIX)

    # 11. Annotation Extension
    # the GPAD-specific value wins over the GAF column 16 value
    if key in gpadCol11Lookup:
        columns.append(','.join(gpadCol11Lookup[key]))
    elif objectKey in gafCol16Lookup:
        columns.append(''.join(gafCol16Lookup[objectKey]))
    else:
        columns.append('')

    # 12. Annotation Properties
    if key in gpadCol12Lookup:
        columns.append('|'.join(gpadCol12Lookup[key]))
    else:
        columns.append('')

    return reportRow + TAB.join(columns) + CRT
def processFile():
    '''
    # requires:
    #	inputFile: tab-delimited, at least 24 fields per line
    #
    # effects:
    #	For each input line either delegates to processFileIKMC
    #	(when createMCL/createNote/setStatus is set) or verifies the
    #	values and writes the allele, mutation, reference, annotation,
    #	accession, note and note-chunk BCP records plus one row in
    #	newAlleleFile.  Rows that fail verification are skipped.
    #
    # returns:
    #	nothing
    #
    '''

    global alleleKey, refAssocKey, accKey, noteKey, mgiKey, annotKey
    global alleleLookup

    # reference association type key for each reference type used in the input
    refAssocTypeKeys = {'Original': 1011, 'Transmission': 1023, 'Molecular': 1012}

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():

        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            (markerID, symbol, name, alleleStatus, alleleType, alleleSubtypes,
             collectionKey, germLine, references, strainOfOrigin, mutantCellLine,
             molecularNotes, driverNotes, ikmcNotes, mutations, inheritanceMode,
             isMixed, isExtinct, createdBy, createMCL, createNote, setStatus,
             existingAlleleID, ikmcSymbol) = tokens[:24]
        except ValueError:
            # fewer than 24 fields
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # creator
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)
        if createdByKey == 0:
            continue

        # processing for IKMC-only
        if len(createMCL) > 0 or len(createNote) > 0 or len(setStatus) > 0:
            processFileIKMC(createMCL, createNote, setStatus, \
                symbol, ikmcSymbol, mutantCellLine, ikmcNotes, \
                createdByKey, existingAlleleID)
            continue

        # marker key
        markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

        # hard-coded
        # _vocab_key = 73 (Marker-Allele Association Status)
        # _term_key = 4268545 (Curated)
        markerStatusKey = 4268545

        # _vocab_key = 37 (Allele Status)
        alleleStatusKey = loadlib.verifyTerm('', 37, alleleStatus, lineNum, errorFile)

        # _vocab_key = 38 (Allele Type)
        alleleTypeKey = loadlib.verifyTerm('', 38, alleleType, lineNum, errorFile)

        # _vocab_key = 61 (Allele Transmission)
        germLineKey = loadlib.verifyTerm('', 61, germLine, lineNum, errorFile)

        # _vocab_key = 36 (Allele Molecular Mutation)
        allMutations = mutations.split('|')

        # _vocab_key = 35
        inheritanceModeKey = loadlib.verifyTerm('', 35, inheritanceMode, lineNum, errorFile)

        # strains
        strainOfOriginKey = sourceloadlib.verifyStrain(strainOfOrigin, lineNum, errorFile)

        # reference
        # NOTE(review): jnum is not assigned anywhere in this function;
        # presumably a module-level value -- confirm.
        refKey = loadlib.verifyReference(jnum, lineNum, errorFile)

        # if errors, continue to next record
        # errors are stored (via loadlib) in the .error log
        # (allMutations is a list and can never equal 0, so it is not tested)
        if 0 in (markerKey, markerStatusKey, alleleStatusKey, alleleTypeKey, \
                 germLineKey, inheritanceModeKey, strainOfOriginKey, refKey, \
                 createdByKey):
            continue

        # if no errors, process the allele

        # allele (master)
        alleleFile.write('%d|%s|%s|%s|%s|%s|%s|%s|%s|%s|0|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
            % (alleleKey, markerKey, strainOfOriginKey, inheritanceModeKey, alleleTypeKey, \
            alleleStatusKey, germLineKey, collectionKey, symbol, name, \
            isExtinct, isMixed, refKey, markerStatusKey, \
            createdByKey, createdByKey, createdByKey, loaddate, loaddate, loaddate))

        # molecular mutation
        for mutation in allMutations:
            mutationKey = loadlib.verifyTerm('', 36, mutation, lineNum, errorFile)
            mutationFile.write('%s|%s|%s|%s\n' \
                % (alleleKey, mutationKey, loaddate, loaddate))

        #
        # allele references
        #
        for reference in references.split('||'):
            refType, refID = reference.split('|')
            refKey = loadlib.verifyReference(refID, lineNum, errorFile)

            # An unknown type used to reuse the previous reference's type
            # (or fail on the first one); report it and skip instead.
            if refType not in refAssocTypeKeys:
                errorFile.write('Invalid Reference Type (%d): %s\n' % (lineNum, refType))
                continue
            refAssocTypeKey = refAssocTypeKeys[refType]

            refFile.write('%s|%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (refAssocKey, refKey, alleleKey, mgiTypeKey, refAssocTypeKey, \
                createdByKey, createdByKey, loaddate, loaddate))
            refAssocKey = refAssocKey + 1

        #
        # allele subtypes
        #
        for s in alleleSubtypes.split('|'):
            # _vocab_key = 93 (Allele Subtype)
            alleleSubtypeKey = loadlib.verifyTerm('', 93, s, lineNum, errorFile)

            annotFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                % (annotKey, annotTypeKey, alleleKey, alleleSubtypeKey, \
                qualifierKey, loaddate, loaddate))
            annotKey = annotKey + 1

        #
        # mutant cell line
        #
        if len(mutantCellLine) > 0:
            addMutantCellLine(alleleKey, mutantCellLine, createdByKey)

        # MGI Accession ID for the allele
        accFile.write('%s|%s%d|%s|%s|1|%d|%d|0|1|%s|%s|%s|%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, alleleKey, mgiTypeKey, \
            createdByKey, createdByKey, loaddate, loaddate))

        # storing data in MGI_Note/MGI_NoteChunk

        # molecular notes
        mgiNoteSeqNum = 1
        if len(molecularNotes) > 0:
            noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, alleleKey, mgiNoteObjectKey, mgiMolecularNoteTypeKey, \
                createdByKey, createdByKey, loaddate, loaddate))

            noteChunkFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, mgiNoteSeqNum, molecularNotes, createdByKey, createdByKey, loaddate, loaddate))

            noteKey = noteKey + 1

        # driver notes
        # TR12662/MGI_Relationship._Category_key = 1006
        # removed noteFile code
        # place holder for MGI_Relationship code
        # the IKMC is the only product using this and IKMC does not add any driver note

        # ikmc notes
        useIKMCnotekey = 0
        if len(ikmcNotes) > 0:
            noteFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, alleleKey, mgiNoteObjectKey, mgiIKMCNoteTypeKey, \
                createdByKey, createdByKey, loaddate, loaddate))

            noteChunkFile.write('%s|%s|%s|%s|%s|%s|%s\n' \
                % (noteKey, 1, ikmcNotes, createdByKey, createdByKey, loaddate, loaddate))

            useIKMCnotekey = noteKey
            noteKey = noteKey + 1

        # Print out a new text file and attach the new MGI Allele IDs as the last field

        if createdBy == 'ikmc_alleleload':
            newAlleleFile.write('%s\t%s%s\t%s\n' \
                % (mgi_utils.prvalue(ikmcNotes), \
                mgi_utils.prvalue(mgiPrefix), mgi_utils.prvalue(mgiKey), \
                mgi_utils.prvalue(ikmcSymbol)))
        else:
            # One column per value, then the new MGI ID; joining keeps the
            # number of columns and the number of values in step.
            reportFields = [markerID, symbol, name, alleleStatus, alleleType, \
                alleleSubtypes, collectionKey, germLine, references, strainOfOrigin, \
                mutantCellLine, mutations, inheritanceMode, isMixed, isExtinct, \
                refKey, markerStatusKey, createdBy]
            newAlleleFile.write( \
                '\t'.join(['%s' % (mgi_utils.prvalue(f),) for f in reportFields]) \
                + '\t%s%s\n' % (mgi_utils.prvalue(mgiPrefix), mgi_utils.prvalue(mgiKey)))

        # save symbol/alleleKey/ikmc note key
        alleleLookup[symbol] = []
        alleleLookup[symbol].append((alleleKey, useIKMCnotekey, mgiPrefix + str(mgiKey)))

        accKey = accKey + 1
        mgiKey = mgiKey + 1
        alleleKey = alleleKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    if not DEBUG:
        db.sql('select * from ACC_setMax(%d)' % (lineNum), None)
        db.commit()
''', 'auto') s = '' count = 0 for r in results: stage = r['stage'] age = r['age'] m = re.search('[0-9]',age) # if age has no numeric specified, print it out; probable error if m == None: s = s + r['mgi'] + TAB + r['jnum'] + TAB + mgi_utils.prvalue(r['label']) + CRT count = count + 1 continue start = m.start() range = age[start:] # parse by range "-" or list "," m = re.search('[-,]', range) if m == None: minAge = string.atof(range) maxAge = minAge else: delim = m.start()
def processGelLaneFile():
    '''
    # requires:
    #	inGelLaneFile: tab-delimited, at least 13 fields per line
    #
    # effects:
    #	Writes one gel lane record per new assay/lane pair and one
    #	gel lane structure record per line whose control is "No".
    #	Remembers the lane key of every assay/lane pair in assayGelLane.
    #	Lines that fail verification are skipped.
    #
    # returns:
    #	nothing
    #
    '''

    global assayGelLane, gelLaneKey

    lineNum = 0

    # For each line in the input file
    for line in inGelLaneFile.readlines():

        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            (assayID, laneID, laneLabel, genotypeID, rnaType, control,
             sampleAmount, gender, age, ageNote, laneNote, emapaID,
             structureTS) = tokens[:13]
        except ValueError:
            # fewer than 13 fields
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # if control is set to "No", then there *is* a structure
        # else there are no structures
        hasStructure = 0
        if control == "No":
            hasStructure = 1

        genotypeKey = gxdloadlib.verifyGenotype(genotypeID, lineNum, errorFile)
        rnaTypeKey = gxdloadlib.verifyGelRNAType(rnaType, lineNum, errorFile)
        controlKey = gxdloadlib.verifyGelControl(control, lineNum, errorFile)
        ageMin, ageMax = agelib.ageMinMax(age)

        if hasStructure:
            structureKey = gxdloadlib.verifyTerm(emapaID, 90, '', lineNum, errorFile)
            if structureKey == 0:
                error = 1

        #
        # if age = "Not Specified", then ageMin/ageMax = -1 which is < 0
        # so ageMin/ageMax are deliberately not part of this check
        #
        if genotypeKey == 0 or rnaTypeKey == 0 or controlKey == 0:
            error = 1

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process

        key = '%s:%s' % (assayID, laneID)

        # if this is a lane that has not been added to the gel lane yet...
        if key not in assayGelLane:

            outGelLaneFile.write(TAB.join([
                str(gelLaneKey),
                str(assayAssay[assayID]),
                str(genotypeKey),
                str(rnaTypeKey),
                str(controlKey),
                str(laneID),
                laneLabel,
                mgi_utils.prvalue(sampleAmount),
                gender,
                age,
                str(ageMin),
                str(ageMax),
                mgi_utils.prvalue(ageNote),
                mgi_utils.prvalue(laneNote),
                loaddate,
                loaddate]) + CRT)

            if hasStructure:
                outGelLaneStFile.write(TAB.join([
                    str(gelLaneKey),
                    str(structureKey),
                    loaddate,
                    loaddate]) + CRT)

            assayGelLane[key] = gelLaneKey
            gelLaneKey = gelLaneKey + 1

        # else if gel lanes has more than one structure...
        elif hasStructure:
            outGelLaneStFile.write(TAB.join([
                str(assayGelLane[key]),
                str(structureKey),
                loaddate,
                loaddate]) + CRT)

    # end of "for line in inGelLaneFile.readlines():"

    return
and a1._LogicalDB_key = 1 and a1.prefixPart = 'MGI:' and a1.preferred = 1 and p._Marker_key = a2._Object_key and a2._MGIType_key = 2 and a2._LogicalDB_key = 1 and a2.prefixPart = 'MGI:' and a2.preferred = 1 order by p.symbol ''', 'auto') for r in results: mname = r['mname'] pname = r['pname'] p1seq = r['primer1sequence'] p2seq = r['primer2sequence'] prodSize = r['productSize'] fp.write(r['symbol'] + TAB + mname + TAB + pname + TAB + r['markerID'] + TAB + r['probeID'] + TAB + mgi_utils.prvalue(p1seq) + TAB + mgi_utils.prvalue(p2seq) + TAB + mgi_utils.prvalue(prodSize) + TAB + r['chromosome'] + TAB + str(r['cmoffset']) + CRT) reportlib.finish_nonps(fp)
def processGelBandFile():
    '''
    # requires:
    #	inGelBandFile: tab-delimited, at least 8 fields per line
    #	assayGelLane: already holds the lane key of every
    #		"assayID:laneID" pair named in the file
    #
    # effects:
    #	Writes one gel row record each time the assay ID changes and
    #	one gel band record per line.
    #	Lines that fail verification are skipped.
    #
    # returns:
    #	nothing
    #
    '''

    global gelRowKey, gelBandKey

    lineNum = 0
    prevAssay = 0

    # For each line in the input file
    for line in inGelBandFile.readlines():

        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            (assayID, laneID, rowID, bandSize, bandUnits, bandStrength,
             rowNote, bandNote) = tokens[:8]
        except ValueError:
            # fewer than 8 fields
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        unitsKey = gxdloadlib.verifyGelUnits(bandUnits, lineNum, errorFile)
        strengthKey = gxdloadlib.verifyGelStrength(bandStrength, lineNum, errorFile)

        # if errors, continue to next record
        if unitsKey == 0 or strengthKey == 0:
            continue

        # new Assay means new Row
        # NOTE(review): a row record is only written when the assay changes,
        # not when rowID changes within an assay -- confirm this matches
        # the layout of the input file.
        if prevAssay != assayID:

            gelRowKey = gelRowKey + 1

            outGelRowFile.write(TAB.join([
                str(gelRowKey),
                str(assayAssay[assayID]),
                str(unitsKey),
                str(rowID),
                mgi_utils.prvalue(bandSize),
                mgi_utils.prvalue(rowNote),
                loaddate,
                loaddate]) + CRT)

            prevAssay = assayID

        # determine the lane key based on assayID and laneID
        key = '%s:%s' % (assayID, laneID)
        laneKey = assayGelLane[key]

        outGelBandFile.write(TAB.join([
            str(gelBandKey),
            str(laneKey),
            str(gelRowKey),
            str(strengthKey),
            mgi_utils.prvalue(bandNote),
            loaddate,
            loaddate]) + CRT)

        gelBandKey = gelBandKey + 1

    # end of "for line in inGelBandFile.readlines():"

    return
def processFile():
    '''
    # requires:
    #   inputFile, errorFile and all output files (probeFile, markerFile,
    #   refFile, aliasFile, accFile, accRefFile, noteFile, rawNoteFile,
    #   newProbeFile), loaddate, mgiPrefix, mgiTypeKey, DEBUG and the
    #   running probeKey / refKey / aliasKey / accKey / mgiKey counters
    #   are set up at module level
    #
    # effects:
    #   Reads inputFile (tab-delimited, at least 22 columns) and, for each
    #   line that passes verification, writes the probe record plus its
    #   marker, reference, alias, accession and note records.
    #   Lines that fail verification are skipped after the problems have
    #   been written to errorFile; a line with too few columns aborts the
    #   load via exit().
    #   Unless DEBUG is set, calls ACC_setMax with the number of lines read.
    #
    # returns:
    #   nothing
    #
    '''

    global probeKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():

        error = 0
        lineNum = lineNum + 1

        # drop the trailing newline character and split into columns
        tokens = line[:-1].split('\t')

        # Only a short line (IndexError) is an "Invalid Line"; a bare
        # "except:" would also swallow e.g. KeyboardInterrupt.
        try:
            name = tokens[0]
            jnum = tokens[1]
            parentID = tokens[2]
            sourceName = tokens[3]
            organism = tokens[4]
            strain = tokens[5]
            tissue = tokens[6]
            gender = tokens[7]
            cellLine = tokens[8]
            age = tokens[9]
            vectorType = tokens[10]
            segmentType = tokens[11]
            regionCovered = tokens[12]
            insertSite = tokens[13]
            insertSize = tokens[14]
            markerIDs = tokens[15].split('|')
            relationship = tokens[16]
            sequenceIDs = tokens[17]
            aliasList = tokens[18].split('|')
            notes = tokens[19]
            rawnotes = tokens[20]
            createdBy = tokens[21]
        except IndexError:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        parentProbeKey = ''
        sourceKey = 0

        if parentID != '':
            # parent from = yes, source given = yes or no (ignored):
            # the source key comes from the parent probe
            parentProbeKey, sourceKey = verifyParentProbe(
                parentID, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)

            if parentProbeKey == 0 or sourceKey == 0 or vectorKey == 0 or segmentTypeKey == 0:
                error = 1

        elif sourceName != '':
            # no parent, named source: look the source up as a library
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifyLibrary(sourceName, lineNum, errorFile)

            if vectorKey == 0 or segmentTypeKey == 0 or sourceKey == 0:
                error = 1

        else:
            # no parent, no named source: verify each source attribute
            organismKey = sourceloadlib.verifyOrganism(organism, lineNum, errorFile)
            strainKey = sourceloadlib.verifyStrain(strain, lineNum, errorFile)
            tissueKey = sourceloadlib.verifyTissue(tissue, lineNum, errorFile)
            genderKey = sourceloadlib.verifyGender(gender, lineNum, errorFile)
            cellLineKey = sourceloadlib.verifyCellLine(cellLine, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifySource(segmentTypeKey,
                vectorKey, organismKey, strainKey,
                tissueKey, genderKey, cellLineKey, age, lineNum, errorFile)

            if organismKey == 0 or strainKey == 0 or tissueKey == 0 or \
               genderKey == 0 or cellLineKey == 0 or vectorKey == 0 or \
               segmentTypeKey == 0 or sourceKey == 0:
                errorFile.write('%s, %s, %s, %s, %s, %s, %s, %s\n'
                    % (segmentType, vectorType, organism, strain, tissue,
                       gender, cellLine, age))
                error = 1

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        if referenceKey == 0:
            errorFile.write('Invalid Reference: %s\n' % (jnum))
            error = 1

        if createdByKey == 0:
            errorFile.write('Invalid Creator: %s\n\n' % (createdBy))
            error = 1

        # marker IDs
        # Keep each ID together with its key so that later messages can
        # name the marker they are actually about.
        markerList = []
        for markerID in markerIDs:
            # verifyMarker is called for every entry, including empty ones,
            # exactly as before
            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)
            if len(markerID) == 0:
                continue
            if markerKey == 0:
                errorFile.write('Invalid Marker: %s, %s\n' % (name, markerID))
                error = 1
            else:
                markerList.append((markerID, markerKey))

        # sequence IDs ("logicalDB:accession" pairs separated by '|')
        seqAccDict = {}
        for seqID in sequenceIDs.split('|'):
            if len(seqID) > 0:
                # NOTE(review): an entry without exactly one ':' raises
                # ValueError on this unpacking and stops the load.
                [logicalDB, acc] = seqID.split(':')
                logicalDBKey = loadlib.verifyLogicalDB(logicalDB, lineNum, errorFile)
                if logicalDBKey > 0:
                    seqAccDict[acc] = logicalDBKey

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process the probe

        probeFile.write('%d\t%s\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t\t%s\t%s\t%s\t%s\n' \
            % (probeKey, name, parentProbeKey, sourceKey, vectorKey, segmentTypeKey,
               mgi_utils.prvalue(regionCovered),
               mgi_utils.prvalue(insertSite),
               mgi_utils.prvalue(insertSize),
               createdByKey, createdByKey, loaddate, loaddate))

        # A marker that resolves to the same key more than once on a line is
        # not written at all; every occurrence is reported instead.
        markerKeys = [markerKey for (markerID, markerKey) in markerList]
        for (markerID, markerKey) in markerList:
            if markerKeys.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (probeKey, markerKey, referenceKey, relationship,
                       createdByKey, createdByKey, loaddate, loaddate))
            else:
                # markerID is the ID of this occurrence (previously a stale
                # variable from the verification loop was reported here)
                errorFile.write('Invalid Marker Duplicate: %s, %s\n' % (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' \
            % (refKey, probeKey, referenceKey, createdByKey, createdByKey,
               loaddate, loaddate))

        # aliases
        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (aliasKey, refKey, alias, createdByKey, createdByKey,
                   loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the probe
        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, probeKey, mgiTypeKey,
               createdByKey, createdByKey, loaddate, loaddate))

        # Print out a new text file and attach the new MGI Probe IDs as the last field
        # NOTE(review): regionCovered and insertSite are concatenated into a
        # single column below (no separator), and parentID / rawnotes are not
        # echoed -- confirm this is the intended layout before changing it.
        newProbeFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (name, jnum,
               mgi_utils.prvalue(sourceName),
               organism,
               mgi_utils.prvalue(strain),
               mgi_utils.prvalue(tissue),
               mgi_utils.prvalue(gender),
               mgi_utils.prvalue(cellLine),
               mgi_utils.prvalue(age),
               mgi_utils.prvalue(vectorType),
               mgi_utils.prvalue(segmentType),
               mgi_utils.prvalue(regionCovered) +
               mgi_utils.prvalue(insertSite),
               mgi_utils.prvalue(insertSize),
               '|'.join(markerIDs),
               relationship,
               mgi_utils.prvalue(sequenceIDs),
               '|'.join(aliasList),
               mgi_utils.prvalue(notes),
               createdBy, mgiPrefix, mgiKey))

        # Print out a raw note file
        if len(rawnotes) > 0:
            rawNoteFile.write('%s%d\t%s\n' % (mgiPrefix, mgiKey, rawnotes))

        # Notes
        if len(notes) > 0:
            noteFile.write('%s\t%s\t%s\t%s\n' % (probeKey, notes, loaddate, loaddate))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids (each also gets an accession reference)
        for acc in seqAccDict.keys():
            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, seqAccDict[acc],
                   probeKey, mgiTypeKey,
                   createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey,
                   loaddate, loaddate))
            accKey = accKey + 1

        refKey = refKey + 1
        probeKey = probeKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    # lineNum counts every line read, including rejected ones, while mgiKey
    # only advances for accepted lines, so this may pass a larger count than
    # the number of MGI IDs actually written.
    #

    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)