def sumMeasure(M_lil, M_csc, queryString):
    """
    Score every document containing at least one of the query terms
    (implicit 'or' between terms) by summing the document's matrix
    weights for those terms.

    Returns a list of (score, pmidHash) tuples, one per matching document.
    """
    # Per-term lists of matching document rows (pmid-hashes), plus the
    # hashed query terms themselves.
    rowIndexLists, queryTermHashes = SearchTermDoc.extractRowIndices(M_csc, queryString)

    # Collapse the per-term index lists into a single set so no document
    # row is scored more than once.
    documentRows = reduce(lambda acc, ix: acc | set(ix), map(set, rowIndexLists))

    # NOTE: row-length normalization is currently disabled here; scores
    # are raw weight sums (see the precomputed RL-hash used elsewhere).
    scores = []
    for docRow in documentRows:
        docScore = sum(M_lil[docRow, termHash] for termHash in queryTermHashes)
        scores.append((docScore, docRow))
    return scores
def _normalizeVectorLengths(M_lil): """ Normalize the length of a sparse lil_matrix. """ t1=time.time() # Create a norm-hash of each row-vector in the stemmed term-doc matrix. vectorLength=SearchTermDoc.createRLHash(M_lil, _RLHash,False) for row in range(1,M_lil.shape[0]): norm=vectorLength[row] for col in (M_lil.getrow(row).nonzero()[1])[1:]: M_lil[row,col]=(M_lil[row,col])/norm print "Normalized:",row t2=time.time() print "Total:"+str(t2-t1) # This is madness tfidfMatrix = M_lil # Save and overwrite the log_tfidf generated above IOmodule.writeOutTDM(_termDocDir, _tfidfName+'_norm', tfidfMatrix)
def analyseDiseaseTerms(M_coo): listOfDiseases=["Adrenoleukodystrophy autosomal neonatal form","Kleine Levin Syndrome"] listOfSymptoms=["Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions", "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"] sanitizer = TextCleaner.sanitizeString() M_lil=M_coo.tolil() count=0 for disease in listOfDiseases: rowIndex=_diseaseHash[disease] termIndices=M_lil.getrow(rowIndex).nonzero()[1][1:] termList=[] for colIndex in termIndices: termList.append((M_lil[rowIndex,colIndex],revTermHashTable[colIndex])) termList.sort() termList.reverse() printout1=[] #for item in termList[:20] # printout1.append(item[1]) count=0 newTermList=[] for item in termList: if len(item[1])>7: newTermList.append(item) for item in newTermList[:20]: printout1.append(item[1]) print 'Top 20 terms:' print '---------------------' print printout1 print "=====================" printout2=[] symptoms=listOfSymptoms[count] symptoms = sanitizer.sub(' ', symptoms) symptoms = FilterInterface.stopwordRemover(symptoms) symptoms=FilterInterface.porterStemmer(symptoms) symptoms=SearchTermDoc._modifySearchString(symptoms) count+=1 for symptom in symptoms: for term in termList: if term[1]==symptom: printout2.append((termList.index(term),symptom)) print 'Ranks of searched symptoms:' print '---------------------' print printout2 print "=====================" print ''
def getSemanticKeywords(top=20): diseaseList=[("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"), ("Adrenoleukodystrophy autosomal neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"), ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"), ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"), ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. 
Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")] matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90" #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10" #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5" scoreDic={} totalTermDic={} for disease in diseaseList: filename=disease[0] symptoms=disease[1] symptoms=SearchTermDoc._modifySearchString(symptoms) symptoms=map(FilterInterface.porterStemmer,symptoms) M_coo=IOmodule.readInTDM(matrixDir,filename) totalTermDic[filename]=(M_coo.shape[1]-1) M_csc=M_coo.tocsc() termSum=[] for col in range(1,M_coo.shape[1]): term=revTermHashTable[M_csc[0,col]] termSum.append((sum(M_csc.getcol(col).data[:-1]),term)) termSum.sort() termSum.reverse() scoreDic[filename]={} for item in termSum: if item[1] in symptoms: scoreDic[filename][item[1]]=termSum.index(item) #return termSum[:top] for score in scoreDic.items(): print "Total number of terms for the disease:",totalTermDic[score[0]] print str(score[0])+'\t'+str(score[1].items()) print ''
def _normalizeVectorLengths(M_dense, M_lil, filename):
    """
    Divide each document row of the dense matrix by its precomputed
    row-vector norm, then save the result as a sparse coo_matrix.

    M_dense  -- dense matrix that is normalized in place
    M_lil    -- lil view of the same matrix, used for nonzero row lookups
    filename -- name under which the normalized matrix is written out
    """
    # Norm of each document row-vector.
    rowNorms = SearchTermDoc.createRLHash(M_lil, None, False)

    for rowIndex in range(1, M_lil.shape[0]):
        rowNorm = rowNorms[rowIndex]
        # Only the row's nonzero columns need scaling; the first nonzero
        # entry is skipped (presumably a hash column -- TODO confirm).
        nonzeroCols = M_lil.getrow(rowIndex).nonzero()[1]
        for colIndex in nonzeroCols[1:]:
            M_dense[rowIndex, colIndex] = M_dense[rowIndex, colIndex] / rowNorm

    # Convert back to a sparse format and persist it.
    tfidfMatrix = sparse.coo_matrix(M_dense)
    IOmodule.writeOutTDM(_termDocDir, filename, tfidfMatrix)
def _generateLogTFIDF(M_coo):
    """
    Create a Term-Frequency Inverse-Document-Frequency weighting from a
    sparse coo_matrix, using log-transformation on both TF and IDF:
    weight = log(1 + tf) * log(N / df).

    Returns a tuple (M_dense, M_lil):
      * M_dense -- dense matrix updated in place with the tf*idf weights
      * M_lil   -- lil copy of the ORIGINAL counts, kept for fast
                   nonzero-row lookups (e.g. by vector normalization)

    Raises ValueError if a nonzero index unexpectedly holds a zero count.
    """
    # Column sums, i.e. per-term document frequencies.
    termSum = SearchTermDoc.createCLHash(M_coo, None, False)

    # Row 0 is excluded from the document count. Kept as float so the
    # idf division below is true division under Python 2.
    numberOfDocs = float(M_coo.shape[0] - 1)

    # Use a lil-matrix for nonzero row lookups.
    M_lil = M_coo.tolil()
    # Use a dense matrix for constant-time element access.
    M_dense = M_coo.todense()

    # BUG FIX: the original iterated range(1, numberOfDocs + 1) with a
    # float bound, which raises TypeError; iterate the int row count.
    for row in range(1, M_coo.shape[0]):
        # Skip the first nonzero column of the row (hash column).
        for col in (M_lil.getrow(row).nonzero()[1])[1:]:
            # Term frequency.
            tf = M_dense[row, col]
            if tf == 0:
                # BUG FIX: the original error path referenced undefined
                # names (docIndex/termVectorIndex) and raised a bare
                # Exception; ValueError is still caught by `except Exception`.
                raise ValueError("Looked up zero-value at: " + str(row) + " " + str(col))
            # Log-transformation of the term frequency.
            tf = math.log(1 + tf)
            # Inverse-document frequency.
            idf = math.log(numberOfDocs / termSum[col])
            M_dense[row, col] = tf * idf

    return M_dense, M_lil
def cosineMeasure(M_lil, M_csc, queryString):
    """
    Calculate a score for each document containing one or more of the
    query terms (implicit 'or' between query terms) by summing the
    document's weights for those terms.

    Returns a scored list of (score, pmidHash) tuples covering every
    matching document.
    """
    # Per-term document index lists and the hashed query terms.
    indexLists, termHashes = SearchTermDoc.extractRowIndices(M_csc, queryString)

    # Merge the per-term lists into one set so each matching document row
    # is visited exactly once.
    uniqueRows = reduce(set.union, map(set, indexLists))

    scoredDocs = []
    for rowHash in uniqueRows:
        rowScore = sum(M_lil[rowHash, th] for th in termHashes)
        scoredDocs.append((rowScore, rowHash))
    return scoredDocs