Пример #1
0
def sumMeasure(M_lil, M_csc, queryString):

    """
    Score every row that matches at least one query term by summing the
    row's matrix entries over all query terms.

    M_lil       -- matrix supporting M_lil[row, col] lookups; the scores are
                   read from here.
    M_csc       -- matrix handed to SearchTermDoc.extractRowIndices to find
                   the rows matching the query.
    queryString -- the query, passed unchanged to extractRowIndices.

    Returns a list of (score, pmidHash) tuples, one per matching row, in no
    particular order. Returns an empty list when nothing matches.

    The scores are raw sums; they are not divided by the row-vector length.
    """

    # Extract the relevant indices of the row-vectors (pmid-hashes)
    searchIndices, hashedSearchTerms = SearchTermDoc.extractRowIndices(M_csc, queryString)

    # Union the index collections to avoid scoring a row more than once.
    # set().union(*...) also copes with zero collections, where reduce()
    # without an initial value would raise a TypeError.
    uniqueRows = set().union(*searchIndices)

    results = []
    for pmidHash in uniqueRows:
        score = sum(M_lil[pmidHash, termHash] for termHash in hashedSearchTerms)
        results.append((score, pmidHash))

    return results
Пример #2
0
def _normalizeVectorLengths(M_lil):

    """
    Divide the stored entries of every row (from row 1 onwards) of a sparse
    lil_matrix by that row's norm, in place, and write the result to disk.

    The norms come from SearchTermDoc.createRLHash. For each row the first
    nonzero column is left untouched (presumably it holds the row's
    identifier rather than a term weight -- confirm against the matrix
    layout). Progress and total run time are printed to stdout.
    """

    startTime = time.time()

    # One norm per row-vector of the term-doc matrix.
    rowNorms = SearchTermDoc.createRLHash(M_lil, _RLHash, False)

    rowCount = M_lil.shape[0]
    for rowIndex in range(1, rowCount):

        rowNorm = rowNorms[rowIndex]
        nonzeroCols = M_lil.getrow(rowIndex).nonzero()[1]
        for colIndex in nonzeroCols[1:]:
            M_lil[rowIndex, colIndex] = M_lil[rowIndex, colIndex] / rowNorm
        print(" ".join(("Normalized:", str(rowIndex))))

    elapsed = time.time() - startTime
    print("Total:" + str(elapsed))

    # The matrix was normalized in place, so M_lil itself is what gets saved
    # (under the tf-idf name with a '_norm' suffix).
    IOmodule.writeOutTDM(_termDocDir, _tfidfName + '_norm', M_lil)
Пример #3
0
def analyseDiseaseTerms(M_coo):

    """
    Print, for two hard-coded diseases, the top-weighted terms of the
    disease's matrix row and the rank that each term of a matching symptom
    description obtains in that row.

    M_coo -- sparse matrix with a tolil() method; rows are looked up through
             the module-level _diseaseHash and columns are translated to
             terms through revTermHashTable.

    Nothing is returned; all results are printed to stdout.
    """

    listOfDiseases=["Adrenoleukodystrophy  autosomal  neonatal form","Kleine Levin Syndrome"]
    listOfSymptoms=["Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions",
                    "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"]

    sanitizer = TextCleaner.sanitizeString()

    M_lil = M_coo.tolil()

    # Pair every disease with its own symptom description. The previous
    # implementation reset its counter inside the loop, so every disease
    # was compared against the first description.
    for disease, symptoms in zip(listOfDiseases, listOfSymptoms):
        rowIndex = _diseaseHash[disease]

        # The first nonzero column of the row is skipped (presumably it
        # holds the row identifier rather than a term weight -- confirm).
        termIndices = M_lil.getrow(rowIndex).nonzero()[1][1:]

        # (weight, term) pairs, highest weight first.
        termList = [(M_lil[rowIndex, colIndex], revTermHashTable[colIndex])
                    for colIndex in termIndices]
        termList.sort(reverse=True)

        # Only terms longer than 7 characters are listed; keep the best 20.
        printout1 = [term for (_, term) in termList if len(term) > 7][:20]

        print('Top 20 terms:')
        print('---------------------')
        print(printout1)
        print("=====================")

        # Run the symptom description through the same clean-up steps as
        # before: sanitize, remove stopwords, stem, split into search terms.
        symptoms = sanitizer.sub(' ', symptoms)
        symptoms = FilterInterface.stopwordRemover(symptoms)
        symptoms = FilterInterface.porterStemmer(symptoms)
        symptoms = SearchTermDoc._modifySearchString(symptoms)

        # enumerate() yields the rank directly instead of rescanning the
        # list with termList.index() for every hit.
        printout2 = []
        for symptom in symptoms:
            for rank, (_, term) in enumerate(termList):
                if term == symptom:
                    printout2.append((rank, symptom))

        print('Ranks of searched symptoms:')
        print('---------------------')
        print(printout2)
        print("=====================")
        print('')
Пример #4
0
def getSemanticKeywords(top=20):

    """
    For a hard-coded list of (disease, symptom description) pairs, read the
    disease's matrix from disk, rank its terms by summed column weight and
    print the rank of every stemmed symptom term found among them.

    top -- currently unused; kept so existing callers keep working.

    Nothing is returned; the ranks and the number of term columns per
    disease are printed to stdout.
    """

    diseaseList=[("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"),
                ("Adrenoleukodystrophy  autosomal  neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
                ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"),
                ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
                ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")]

    matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5"

    scoreDic = {}
    totalTermDic = {}
    for filename, symptoms in diseaseList:

        symptoms = SearchTermDoc._modifySearchString(symptoms)
        # Build a real list: the membership test below runs once per term,
        # which a lazy map object would not survive.
        symptoms = [FilterInterface.porterStemmer(symptom) for symptom in symptoms]

        M_coo = IOmodule.readInTDM(matrixDir, filename)
        # Column 0 is not counted as a term column.
        totalTermDic[filename] = (M_coo.shape[1] - 1)

        M_csc = M_coo.tocsc()

        termSum = []
        for col in range(1, M_coo.shape[1]):
            # Row 0 of each column holds the key into revTermHashTable.
            term = revTermHashTable[M_csc[0, col]]
            # NOTE(review): data[:-1] drops the LAST stored entry of the
            # column. If the row-0 key is stored first, it is included in
            # the sum and the final document is left out -- confirm whether
            # data[1:] was intended.
            termSum.append((sum(M_csc.getcol(col).data[:-1]), term))

        # Highest summed weight first.
        termSum.sort(reverse=True)

        # enumerate() gives each term's rank in one pass instead of calling
        # termSum.index() per hit. Assumes each term occurs in one column.
        ranks = {}
        for rank, (_, term) in enumerate(termSum):
            if term in symptoms:
                ranks[term] = rank
        scoreDic[filename] = ranks

    for filename, ranks in scoreDic.items():
        print("Total number of terms for the disease: " + str(totalTermDic[filename]))
        print(str(filename) + '\t' + str(list(ranks.items())))
        print('')
Пример #5
0
def _normalizeVectorLengths(M_dense, M_lil, filename):

    """
    Divide the entries of M_dense by the norm of their row and save the
    result as a sparse coo_matrix under `filename`.

    M_dense  -- dense form of the matrix; modified in place.
    M_lil    -- lil form of the same matrix; used to obtain the row norms
                (via SearchTermDoc.createRLHash) and the nonzero columns.
    filename -- name handed to IOmodule.writeOutTDM.

    Row 0 and, within each row, the first nonzero column are left untouched
    (presumably they hold identifiers rather than weights -- confirm).
    """

    rowNorms = SearchTermDoc.createRLHash(M_lil, None, False)

    rowCount = M_lil.shape[0]
    for rowIndex in range(1, rowCount):

        rowNorm = rowNorms[rowIndex]
        storedCols = M_lil.getrow(rowIndex).nonzero()[1]
        for colIndex in storedCols[1:]:
            M_dense[rowIndex, colIndex] = M_dense[rowIndex, colIndex] / rowNorm

    # Convert back to a sparse representation before writing to disk.
    normalizedMatrix = sparse.coo_matrix(M_dense)
    IOmodule.writeOutTDM(_termDocDir, filename, normalizedMatrix)
Пример #6
0
def _generateLogTFIDF(M_coo):

    """
    Creates a Term-Frequency Inverse-Document-Frequency matrix from a sparse
    coo_matrix, using log-transformation on both TF and IDF:

        tf  = log(1 + tf)
        idf = log(numberOfDocs / termSum[col])

    M_coo -- sparse coo_matrix; row 0 is not counted as a document and, in
             each row, the first nonzero column is skipped (presumably both
             hold identifiers rather than counts -- confirm).

    Returns the tuple (M_dense, M_lil): the dense matrix holding the
    tf-idf values and the untouched lil form of the input, both to be used
    for vector-normalization.

    Raises ValueError if a position reported as nonzero by the lil-matrix
    holds a zero in the dense matrix.
    """

    termSum = SearchTermDoc.createCLHash(M_coo, None, False)

    # Keep the integer count for range() and a float copy for the division;
    # range() does not accept float arguments.
    docCount = M_coo.shape[0] - 1
    numberOfDocs = float(docCount)

    # Use a lil-matrix for nonzero row lookups
    M_lil = M_coo.tolil()
    # Use a dense-matrix for constant-time lookups
    # NOTE(review): assumes a floating-point dtype; with an integer dtype
    # the tf-idf values assigned below would be truncated -- confirm.
    M_dense = M_coo.todense()

    for row in range(1, docCount + 1):

        for col in (M_lil.getrow(row).nonzero()[1])[1:]:

            # Term frequency
            tf = M_dense[row, col]

            if tf == 0:
                # The message previously referenced undefined names and
                # therefore failed with a NameError instead.
                raise ValueError("Looked up zero-value at: " + str(row) + " " + str(col))

            # Log-transformation of the term frequency
            tf = math.log(1 + tf)

            # Inverse-document frequency
            idf = math.log(numberOfDocs / termSum[col])

            M_dense[row, col] = tf * idf

    return M_dense, M_lil
Пример #7
0
def cosineMeasure(M_lil, M_csc, queryString):

    """
    Score each document containing one or more of the query-terms in the
    query string (thereby the implicit 'or' between each query-term) by
    summing the document's matrix entries over all query terms.

    NOTE(review): this was previously described as the square-root of the
    cosine score. The code only sums entries of M_lil, so that presumably
    relies on M_lil being length-normalized beforehand -- confirm.

    M_lil       -- matrix supporting M_lil[row, col] lookups; the scores are
                   read from here.
    M_csc       -- matrix handed to SearchTermDoc.extractRowIndices to find
                   the rows matching the query.
    queryString -- the query, passed unchanged to extractRowIndices.

    Returns a list of (score, pmidHash) tuples for all the documents
    mentioned above, in no particular order. Returns an empty list when
    nothing matches.
    """

    # Extract the relevant indices of the row-vectors (pmid-hashes)
    searchIndices, hashedSearchTerms = SearchTermDoc.extractRowIndices(M_csc, queryString)

    # Union the index collections to avoid scoring a row more than once.
    # set().union(*...) also copes with zero collections, where reduce()
    # without an initial value would raise a TypeError.
    uniqueRows = set().union(*searchIndices)

    results = []
    for pmidHash in uniqueRows:
        score = sum(M_lil[pmidHash, termHash] for termHash in hashedSearchTerms)
        results.append((score, pmidHash))

    return results