def _querySortedHeaderMap(genotypeKeys):
    """
    Look up the annotation header terms recorded for each genotype key.

    The keys are queried in batches of 100. Each query orders its rows by
    _object_key and then sequencenum, and header terms are appended in
    the order the rows come back.

    Returns a dict of genotypeKey -> list of header terms.
    """
    # Adjacent literals are joined at compile time into a single string.
    sql_template = (
        " select vah._object_key, th.term as header, vah.sequencenum"
        " from voc_annotheader vah"
        " join voc_term th on (th._term_key=vah._term_key)"
        " where vah._object_key in (%s)"
        " order by vah._object_key, vah.sequencenum "
    )

    headers_by_genotype = {}
    for key_batch in batch_list(genotypeKeys, 100):
        key_csv = ','.join(str(key) for key in key_batch)
        rows, _ = performQuery(sql_template % (key_csv))

        for row in rows:
            genotype_key = row[0]
            header_term = row[1]
            if genotype_key not in headers_by_genotype:
                headers_by_genotype[genotype_key] = []
            headers_by_genotype[genotype_key].append(header_term)

    return headers_by_genotype
def loadDirectory(targetDir, repo, subPath=''):
    """
    Recurse the targetDir/repo/subPath directory, loading every document
    found into solr.

    targetDir -- base directory containing the repo
    repo      -- repo directory name under targetDir; also stored in the
                 'repo' field and used as the prefix of each document id
    subPath   -- '/'-separated path below the repo root ('' for the root)

    Entries whose name is in IGNORE, or whose text after the last '.' is
    in IGNORE_EXT, are skipped. Both checks apply to directories as well
    as files.
    """
    path = os.path.join(targetDir, repo, subPath)

    for filename in os.listdir(path):
        if filename in IGNORE:
            continue

        # For a name with no '.', this is the whole filename.
        extension = filename.split('.')[-1]
        if extension in IGNORE_EXT:
            continue

        filePath = os.path.join(path, filename)

        # Path relative to the repo root, always '/'-separated because it
        # becomes part of the solr document id.
        fileSubPath = filename
        if subPath:
            fileSubPath = subPath + "/" + filename

        if os.path.isdir(filePath):
            loadDirectory(targetDir, repo, fileSubPath)
            continue

        # Read raw bytes so the explicit decode below is valid whether
        # read() would otherwise hand back bytes or text; undecodable
        # byte sequences are dropped rather than raising.
        with open(filePath, 'rb') as f:
            rawDocument = f.read()
        document = rawDocument.decode('utf-8', 'ignore')

        # Break the content into pieces of size 2048 and keep at most the
        # first 5000 of them.
        # NOTE(review): presumably batch_list slices the string into
        # 2048-character chunks -- confirm against its definition.
        docParts = list(batch_list(document, 2048))[:5000]

        docId = repo + "/" + fileSubPath
        dataDict = [{
            'id': docId,
            'repo': repo,
            'filename': filename,
            'filePath': fileSubPath,
            'content': docParts
        }]
        solrCmd(UPDATE_CMD, data=dataDict)
def _queryGenotypeEdgeMap(genotypeKeys):
    """
    Look up ancestor/descendant term pairs among the annotations of each
    genotype key.

    The keys are queried in batches of 100. Every row pairs a genotype
    with a parent term key and a child term key taken from dag_closure.

    Returns a dict of genotypeKey -> dict of parent _term_key -> list of
    child _term_keys.
    """
    # Adjacent literals are joined at compile time into a single string.
    # Placeholders, in order: mgitype key, genotype key list, annottype key.
    sql_template = (
        " select child_annot._object_key genotype_key,"
        " dc._ancestorobject_key parent_key,"
        " dc._descendentobject_key child_key"
        " from voc_annot child_annot,"
        " voc_annot parent_annot"
        " join dag_closure dc on ("
        " _mgitype_key = %d"
        " and _ancestorobject_key = parent_annot._term_key"
        " )"
        " join voc_term pt on pt._term_key=dc._ancestorobject_key"
        " join voc_term ct on ct._term_key=dc._descendentobject_key"
        " where child_annot._object_key in (%s)"
        " and child_annot._annottype_key = %d"
        " and parent_annot._object_key = child_annot._object_key"
        " and parent_annot._annottype_key = child_annot._annottype_key"
        " and dc._descendentobject_key = child_annot._term_key "
    )

    edges_by_genotype = {}
    for key_batch in batch_list(genotypeKeys, 100):
        query_args = (
            VocTerm._mgitype_key,
            ','.join(str(key) for key in key_batch),
            Genotype._mp_annottype_key,
        )
        rows, _ = performQuery(sql_template % query_args)

        for row in rows:
            genotype_key, parent_key, child_key = row[0], row[1], row[2]
            children_by_parent = edges_by_genotype.setdefault(genotype_key, {})
            children_by_parent.setdefault(parent_key, []).append(child_key)

    return edges_by_genotype
def _queryTermToHeaderMap(genotypeKeys):
    """
    Look up which header terms each annotated term of a genotype falls
    under, for the given list of genotypeKeys.

    The keys are queried in batches of 100. Each distinct row ties a
    genotype and one of its annotated term keys to a header term that is
    an ancestor of that term in dag_closure.

    Returns a dict of genotypeKey -> dict of annotTermKey -> list of
    header terms.
    """
    # Adjacent literals are joined at compile time into a single string.
    # Placeholders, in order: mgitype key, genotype key list.
    sql_template = (
        " select distinct va._object_key, va._term_key, th.term as header"
        " from dag_closure dc"
        " join voc_annotheader vah on (vah._term_key=dc._ancestorobject_key)"
        " join voc_term th on (th._term_key=vah._term_key)"
        " join voc_annot va on (va._term_key=dc._descendentobject_key"
        " and va._object_key=vah._object_key"
        " )"
        " where dc._mgitype_key=%d"
        " and va._object_key in (%s) "
    )

    headers_by_genotype = {}
    for key_batch in batch_list(genotypeKeys, 100):
        query_args = (
            VocTerm._mgitype_key,
            ','.join(str(key) for key in key_batch),
        )
        rows, _ = performQuery(sql_template % query_args)

        for row in rows:
            genotype_key, annot_term_key, header_term = row[0], row[1], row[2]
            if genotype_key not in headers_by_genotype:
                headers_by_genotype[genotype_key] = {}
            headers_by_term = headers_by_genotype[genotype_key]
            headers_by_term.setdefault(annot_term_key, []).append(header_term)

    return headers_by_genotype