def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.

    Every file in the sub-matrix directory is read as a term-doc matrix. Each
    value stored in the first column below row 0 is used as a PMID key, and
    the file name minus its last four characters is appended to that key's
    list of labels. The resulting dictionary is pickled to the hash-table
    directory under the name held in _labelHash. Nothing is returned.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash = {}

    fileCount = 0
    for fileName in files:
        subMatrix = IOmodule.readInTDM(_subMatrixDir, fileName)
        colMatrix = subMatrix.tocsc()

        # Stored values of column 0, skipping row 0
        # (presumably a header row - TODO confirm).
        pmids = colMatrix.getcol(0)[1:].data

        # Drop the four-character extension (presumably '.mtx') once per file.
        diseaseName = fileName[:-4]

        for pmid in pmids:
            # setdefault creates the list on first sight of a PMID without
            # resorting to a catch-all except clause.
            labelHash.setdefault(pmid, []).append(diseaseName)

        fileCount += 1
        print("Remaining: %s Completed %s" % (len(files)-fileCount, diseaseName))

    t2 = time.time()

    print('Created disease label hash in: %s' % str(t2-t1))

    IOmodule.pickleOut(_hashTablesDir, _labelHash, "btd", labelHash)
def createVLHash(M_lil, hashTablesDir="/root/The_Hive/term_doc/hashTables", filename="VLHash"):

    """
    Precompute and save the norm of every row vector in M_lil.

    For each row index (starting at 0) the first stored value of the row is
    skipped and the norm of the remaining stored values is recorded.

    M_lil is expected to be a sparse matrix in LIL format, so that
    getrow(i).data[0] is the list of values stored in row i.

    hashTablesDir and filename choose where the pickled hash is written. The
    defaults reproduce the previously hard-coded location and name, so
    existing callers are unaffected.
    """

    VLHash = {}
    for pmidHash in range(M_lil.shape[0]):
        rowValues = M_lil.getrow(pmidHash).data[0]
        # Skip the first stored value
        # (presumably the PMID hash kept in column 0 - TODO confirm).
        VLHash[pmidHash] = linalg.norm(rowValues[1:])

    IOmodule.pickleOut(hashTablesDir, filename, "btd", VLHash)
def createCLHash(M_coo,filename,save_file=True):

    """
    Precompute the length of each column vector in the term-doc matrix,
    where length means the number of non-zero elements found in the column
    after discarding the first one.

    Column 0 is not processed. When save_file is true, progress is printed
    and the hash is pickled to the hash-table directory under 'filename';
    otherwise the hash is returned and nothing is printed or saved.
    """

    startTime = time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    # Columns of M_coo are the rows of the transposed LIL matrix.
    transposed = M_coo.transpose().tolil()

    columnLengths = {}
    for column in range(1, M_coo.shape[1]):
        # nonzero()[0] holds one index per non-zero element of the row.
        hits = transposed.getrow(column).nonzero()[0]
        entryCount = len(hits[1:])
        columnLengths[column] = entryCount
        if save_file:
            # One entry is added per iteration, so the dictionary size
            # doubles as the running counter.
            print("Hashes created: "+str(len(columnLengths))+". Length:"+str(entryCount))

    endTime = time.time()

    if not save_file:
        return columnLengths

    print("Created and saved ColumnLength-hash in: "+str(endTime-startTime))
    IOmodule.pickleOut(_hashTablesDir, filename,"btd", columnLengths)
def createRLHash(M_lil,filename,save_file=True):

    """
    Precompute the norm of each row vector in the term-doc matrix.

    Row 0 is not processed, and the first stored value of every row is left
    out of the norm. When save_file is true, progress is printed, the hash is
    pickled to the hash-table directory under 'filename' and the elapsed time
    is reported; otherwise the hash is returned.
    """

    startTime = time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    rowNorms = {}
    for row in range(1, M_lil.shape[0]):
        # data[0] is the list of values stored in this row.
        storedValues = M_lil.getrow(row).data[0]
        rowNorms[row] = linalg.norm(storedValues[1:])
        if save_file:
            # One entry is added per iteration, so the dictionary size
            # doubles as the running counter.
            print("Hashes created: "+str(len(rowNorms)))

    if not save_file:
        return rowNorms

    IOmodule.pickleOut(_hashTablesDir, filename,"btd", rowNorms)

    endTime = time.time()
    print("Created and saved RowLength-hash in: "+str(endTime-startTime))
def createDiseaseHash(dir,output=False):

    """
    Receives a directory containing files to be hashed. The file name, up to
    its '.mtx' suffix, is used as the key. The files are required to be in
    .mtx format.

    Files whose matrix has only a single row are skipped. The hashes are
    consecutive integers starting from 1, one per file that is kept.

    If 'output' is true, a line is printed for every hash created. The
    resulting dictionary is pickled to the hash-table directory; nothing is
    returned.
    """

    diseaseHashes = {}

    files = IO.getSortedFilelist(dir)
    counter = 0
    for f in files:
        # str.find returns -1 when the suffix is missing; slicing with -1
        # would silently drop the last character, so keep the name whole.
        suffixAt = f.find('.mtx')
        diseaseName = f[:suffixAt] if suffixAt != -1 else f

        stdm = IO.readInTDM(dir, f)
        if stdm.shape[0] == 1:
            continue

        # Membership test directly on the dict (no intermediate key list).
        if diseaseName not in diseaseHashes:
            counter += 1
            if output:
                print('Created %s with hash %s' % (diseaseName, counter))
            diseaseHashes[diseaseName] = counter

    # NOTE(review): 'diseaseHash' has no leading underscore, unlike the other
    # hash-name constants used in this file - confirm it is defined.
    IO.pickleOut(_hashTablesDir, diseaseHash, "btd", diseaseHashes)
def createTermAndPmidHashes():

    """
    This function creates two hash tables, one for the PMIDs and one for the
    terms, to be used for the term-doc matrix.

    For every record the abstract ('AB'), the title ('TI') and any MeSH terms
    ('MH') are concatenated, sanitized with the pattern supplied by
    TextCleaner, and stripped of stop words. Stemming is applied as well when
    the module-level flag _stemmer is set. Hash values are consecutive
    integers starting from 1, handed out in order of first appearance.

    Both tables are pickled to the hash-table directory and returned as the
    tuple (termHashTable, pmidHashTable).
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable = {}
    pmidHashTable = {}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for fileName in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, fileName)

        for diseaseRecords in records.values():
            for record in diseaseRecords:
                # Hash the PMID
                pmid = record[0]
                if pmid not in pmidHashTable:
                    pmidCounter += 1
                    pmidHashTable[pmid] = pmidCounter

                information = ''
                # Get the abstract. A missing field is reported and skipped;
                # 'except Exception' keeps that best-effort behaviour while
                # letting KeyboardInterrupt / SystemExit through.
                try:
                    information = ' '+record[1]['AB']
                except Exception:
                    print('Unable to get abstract %s' % (record[0],))
                # Get the title
                try:
                    information += ' '+record[1]['TI']
                except Exception:
                    print('Unable to get title for %s' % (record[0],))

                # Most records do not have MeSH terms, so a missing 'MH'
                # field is not reported.
                if 'MH' in record[1]:
                    information += ''.join([' '+meshterm for meshterm in record[1]['MH']])

                # Sanitize the information
                information = sanitizer.sub(' ', information)
                # Remove stop words
                information = FilterInterface.stopwordRemover(information)

                # OPTIONAL: stem the text
                if _stemmer:
                    information = FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter += 1
                        termHashTable[term] = termCounter

        # Running totals after each file.
        print("%s terms hashed. \n%s pmids hashed." % (termCounter, pmidCounter))

    IOmodule.pickleOut(hashTables, _termHash, "btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash, "btd", pmidHashTable)

    return termHashTable, pmidHashTable