def _gatherMatrixData(filename):

    """
    This function utilizes the RecordHandler module to create and structure
    the data to populate the term-doc matrices.

    For every record in the given file the title ('TI'), abstract ('AB') and
    MeSH terms ('MH') are concatenated into one string. That string is
    sanitized, stripped of English stopwords and, if the module-level
    _stemmer flag is set, stemmed, before being passed to _wordCounter
    together with the record's key.

    It takes the records' file name to gather data from.

    It returns a doc-term list on the form: [[PMID,[(term1,count1),...],...]
    (one _wordCounter result per record).
    """

    medlineDir = _medlineDir

    # Get the regex pattern that sanitizes strings.
    sanitizer = sanitizeString()

    records = RecordHandler.loadMedlineRecords(medlineDir, filename)
    fields = RecordHandler.readMedlineFields(records, ['AB', 'TI', 'MH'])

    docTermList = []
    for pmid, record in fields.items():
        information = ''

        # Get the title if any. Only a missing or non-string field is
        # treated as "not found"; anything else (e.g. KeyboardInterrupt)
        # is allowed to propagate.
        try:
            information = ' ' + record['TI']
        except (KeyError, TypeError):
            print('Unable to find title in %s' % (pmid,))

        # Get the abstract if any
        try:
            information += ' ' + record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))

        # Get all the mesh terms if any (each one prefixed by a space)
        if 'MH' in record:
            information += ''.join(' ' + meshterm for meshterm in record['MH'])

        # Sanitize the information
        information = sanitizer.sub(' ', information)

        # Remove english stopwords from the information
        information = FilterInterface.stopwordRemover(information)

        # OPTIONAL:
        # Stem the information
        if _stemmer:
            information = FilterInterface.porterStemmer(information)

        docTermList.append(_wordCounter(pmid, information))

    return docTermList
def testRH(path="/root/The_Hive/data_acquisition/medline_records",
           disease="Winkelman Bethge Pfeiffer syndrome.txt"):

    """
    Ad-hoc check of the record handler: load one disease file and return the
    raw, unprocessed text of each record.

    It takes the directory holding the MEDLINE record files (path) and the
    name of the record file to read (disease). Both default to the values
    that used to be hard-coded, so calling testRH() behaves as before.

    It returns a list of (key, information) tuples, where information is the
    title followed by the abstract and the space-separated MeSH terms.
    Records missing any of the three fields are reported and skipped.
    """

    records = RH.loadMedlineRecords(path, disease)
    fields = RH.readMedlineFields(records, ['TI', 'MH', 'AB'])

    collected = []
    for pmid, record in fields.items():
        # Get the title; skip the record when it is missing
        try:
            information = record['TI']
        except (KeyError, TypeError):
            print('Unable to find title in %s' % (pmid,))
            continue

        # Get the abstract; skip the record when it is missing.
        # NOTE(review): title and abstract are joined without a separator,
        # so the last title word runs into the first abstract word. Kept
        # as-is to preserve the returned text - confirm whether intended.
        try:
            information += record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))
            continue

        # Get the MeSH terms; skip the record when there are none
        try:
            for meshterm in record['MH']:
                information += ' ' + meshterm
        except (KeyError, TypeError):
            print('Unable to find MeSH in %s' % (pmid,))
            continue

        # No sanitizing, stopword removal or stemming is applied here; the
        # text is returned exactly as read.
        collected.append((pmid, information))

    return collected
def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Every file in the MEDLINE directory is read; each previously unseen PMID
    and each previously unseen term is assigned the next running number
    (numbering starts at 1). Terms come from the abstract, title and MeSH
    terms of each record.

    Note that the terms are sanitized for any non-alphanumerical characters.
    And it is default to remove stop words. Stemming is applied when the
    module-level _stemmer flag is set.

    Both tables are pickled to the hash table directory and returned as the
    tuple (termHashTable, pmidHashTable).
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable = {}
    pmidHashTable = {}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir + '/')

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for fileName in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, fileName)

        for diseaseRecords in records.values():
            for record in diseaseRecords:
                # Hash PMID's
                pmid = record[0]
                if pmid not in pmidHashTable:
                    pmidCounter += 1
                    pmidHashTable[pmid] = pmidCounter

                information = ''

                # Get the abstract. Only a missing or non-string field is
                # treated as "not found"; other errors propagate.
                try:
                    information = ' ' + record[1]['AB']
                except (KeyError, TypeError):
                    print('Unable to get abstract %s' % (pmid,))

                # Get the title
                try:
                    information += ' ' + record[1]['TI']
                except (KeyError, TypeError):
                    print('Unable to get title for %s' % (pmid,))

                # Missing MeSH terms are deliberately not reported, as most
                # of the records do not have MeSH.
                if 'MH' in record[1]:
                    information += ''.join(
                        ' ' + meshterm for meshterm in record[1]['MH'])

                # Sanitize the information
                information = sanitizer.sub(' ', information)

                # Remove stopwords from the information
                information = FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the information
                if _stemmer:
                    information = FilterInterface.porterStemmer(information)

                # Hash terms, ignoring the empty strings that splitting on
                # single spaces produces for runs of whitespace
                for term in information.split(' '):
                    if term != '' and term not in termHashTable:
                        termCounter += 1
                        termHashTable[term] = termCounter

    # The line break inside the message is written as an explicit escape;
    # a string literal must not span physical lines.
    print(str(termCounter) + " terms hashed. \n"
          + str(pmidCounter) + " pmids hashed.")

    IOmodule.pickleOut(hashTables, _termHash, "btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash, "btd", pmidHashTable)

    return termHashTable, pmidHashTable