Пример #1
0
def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False):

    files = IO.getSortedFilelist(dir+'/')

    if output:
        counter = 0

    for f in files:
        diseaseName = f[0:f.find('.mtx')]
        subTermDoc = IO.readInTDM(dir, diseaseName)
        if output:
            counter += 1
            print 'Count:', counter

        # If sub term document matrix is empty, just skip it.
        if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1:
            continue

        if time_log:
            t1 = time.time()
        subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log)
        if time_log:
            print 'Time for outlier detection on', diseaseName, ':', str(time.time() -t1)[:4]

        if output:
            print 'Writing',

        subTermDoc = sparse.coo_matrix(subTermDoc)

        IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
Пример #2
0
def runTFIDF():

    """
    Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix.
    """

    files = IOmodule.getSortedFilelist(_matrixDir + "/")

    #    files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)])

    for file in files:

        file = file[:-4]

        subM_coo = IOmodule.readInTDM(_matrixDir, file)

        t1 = time.time()
        dense, lil = _generateLogTFIDF(subM_coo)
        t2 = time.time()
        print "Generated log_TFIDF in " + str(t2 - t1)

        t1 = time.time()
        _normalizeVectorLengths(dense, lil, file)
        t2 = time.time()
        print "Normalized vector lengths in " + str(t2 - t1)

        print "Done with: " + file + "\n"
Пример #3
0
def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash={}

    fileCount=0
    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        colMatrix=subMatrix.tocsc()

        pmids=colMatrix.getcol(0)[1:].data

        for pmid in pmids:
            try:
                labelHash[pmid].append(file[:-4])
            except:
                labelHash[pmid]=[]
                labelHash[pmid].append(file[:-4])
            
        fileCount+=1
        print "Remaining:",(len(files)-fileCount),"Completed",file[:-4]

    t2 = time.time()

    print 'Created disease label hash in:',str(t2-t1)

    IOmodule.pickleOut(_hashTablesDir, _labelHash,"btd", labelHash)
Пример #4
0
def pmidDuplicateCounter(directory, number=None):

        # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])[:number]
        files = IOmodule.getSortedFilelist(directory+'/')
        
        pmidCount={}
        counter=0
        for f in files:
            # Open file descriptor
            fd = open(directory+'/'+f,'r')
            # Read in from file
            diseaseDic=eval(fd.read())
            # Close the file descriptor nicely again
            fd.close()
            
            medlineRecords=diseaseDic['records']

            for record in medlineRecords:
                pmidCount.setdefault(record['PMID'],0)
                pmidCount[record['PMID']]+=1

            counter+=1
            if counter % 1000 == 0:
                print counter
#            print "Files remaining:",(len(files)-counter)
                
        return pmidCount
Пример #5
0
def countRecordfield(directory,field):

    """
    This function counts the number of identical fields in the MedLine records.

    This could for instance be used for a view into how many identical PMIDs
    that have been downloaded on the cross of different disease searches.

    It takes a medline record directory (full path) and a field.

    It returns a dictionary on the form: {PMID: #ids,...}
    """
    
    fieldSum={}
#    files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])
    files = IOmodule.getSortedFilelist(directory+'/')

    counter=0
    for f in files:

        diseaseDic=eval(open(directory+f,'r').read())

        medlineRecords=diseaseDic['records']

        for record in medlineRecords:
            item=record[field]
            fieldSum.setdefault(item,0)
            fieldSum[item]+=1

        counter+=1
        print "Files remaining:",(len(files)-counter)

    return fieldSum
Пример #6
0
def _readDiseases(indexStart=0,indexStop=None):

    """
    Function for returning the content of all or some of the crawled diseases.

    By default all are returned in a dictionary of diseases on the form:
    [{DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}]

    The reason all the dictionaries are returned as a list is to be sure of the
    order.
    """

    path=_subFolder+"/"+diseaseFolder+'/'

    files = IOmodule.getSortedFilelist(path, startIndex=indexStart, stopIndex=indexStop)

    sortedcontents=[]
    for f in files:
        diseaseName=f[0:f.find('.txt')]
        # Fix: the old open(...).read() leaked the descriptor until garbage
        # collection; 'with' closes it deterministically.
        # NOTE(review): eval() on file contents is unsafe for untrusted data.
        with open(path+f,'r') as fd:
            diseaseAttr=eval(fd.read())
        sortedcontents.append({diseaseName: diseaseAttr})

    return sortedcontents
Пример #7
0
def createTermDoc(refreshHash=False):

    """
    This function creates a large term-doc martix from a directory of sub term-
    doc matrices.

    It returns a matrix with dimensions given by the specified hash tables.

    It also saves the matrix for later use as a MatrixMarket .mtx file.

    refreshHash: rebuild the term/PMID hash tables before assembling.
    """

    t1 = time.time()

    if refreshHash:
        createTermAndPmidHashes()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

#    files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)])
    
    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)


    # Need to add one due to non zero indexing
    m=len(pmidHashTable)+1
    n=len(termHashTable)+1

    termDoc = sparse.lil_matrix((m,n))

    # Insert values representing hashes
    # (column 0 and row 0 act as index headers: [i,0] is the pmid hash of
    # row i, [0,j] the term hash of column j)
    for i in range(m): termDoc[i,0]=i
    termDoc[0,:]=range(n)

    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        # dok format gives O(1) random access for the header lookups below.
        subMCopy=subMatrix.todok()
        for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data):
            # Translate the sub matrix's local position (i,j) into global
            # coordinates via its own header column/row: [i,0] holds the
            # global pmid hash, [0,j] the global term hash.
            m = subMCopy[i,0]
            n = subMCopy[0,j]

            # Make sure not to add index's
            if m==0 or n==0:
                continue

            termDoc[m,n] += v
        print "Added",file

    IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc)

    t2 = time.time()

    print 'Time elapsed:',str(t2-t1)

    return termDoc
Пример #8
0
def countFields(directory, fields):

        # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])

        files = IOmodule.getSortedFilelist(directory+'/')
        
        fieldSum={}
        counter=0
        pmidCounter=0
        emptyCounter=0
        noDescription=0

        for f in files:
            
            fd = open(directory+f,'r')
            
            diseaseDic=eval(fd.read())

            fd.close()
            
            medlineRecords=diseaseDic['records']

            descriptionField=diseaseDic['description']

            if descriptionField== '':
                noDescription+=1

            if medlineRecords == []:
                print "Found empty record"
                emptyCounter+=1

            for record in medlineRecords:
                pmidCounter+=1
                for label in fields:
                    fieldSum.setdefault(label,0)
                    if label in record:
                        fieldSum[label]+=1

            counter+=1
            print "Files remaining:",(len(files)-counter)
                
        return fieldSum,{'pmid count': pmidCounter},{'empty diseases': emptyCounter}
Пример #9
0
def runAndSaveMatrices():

    """
    Transform a directory of matrices to a directory of decomposed matrices.
    """

    files = IOmodule.getSortedFilelist(_oldMatrixDir+'/')

    for file in files:

        M_coo=IOmodule.readInTDM(_oldMatrixDir,file)

        # Make sure the matrix contains information (1-dim. is an empty matrix)
        if M_coo.shape[0]==1:
            continue

        print "Shape:"+str(M_coo.shape)

        # SVD does not run well single dimenstion matrices
        ## (remembering that the first dimension is indices and does not count)
        if M_coo.shape[0]>2:
            M_dense=M_coo.todense()

            # Run SVD
            U,Sig,Vt=_svd(M_dense)

            # Get the reduced semantic space
            S= _semanticSpace(U,Sig,Vt,_reduceBy)

            # Recombine the indices and the reduced matrix
            M_dense[1:,1:]=S.todense()

            # Save the matrix
            M_coo=sparse.coo_matrix(M_dense)

            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
        else:
            print "Dimensionality too low for svd"
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
Пример #10
0
def medlineDir2MatrixDir():

    """
    This function converts a directory of MedLine records to a new directory of
    corresponding term-doc matrices.

    It takes the matrix dimensions (row: m, col: n).

    It creates a directory (in the home folder) named 'diseaseMatrices' and
    stores the matrices as 'MatrixMarket' .mtx files, named by the disease name.
    """

    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)

    files = IOmodule.getSortedFilelist(_medlineDir+'/')

#    files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)])

    counter = 0
    for file in files:
        data = _gatherMatrixData(file)

        # Get matrix dimensions (+1 for the [0,0] field)
        ## (Here follows a small 0.0001 sec. hack to get n = total number of terms)
        temp={}
        for pmid in data:
            for term in pmid[1]:
                temp[term[0]]=0
        m=len(data)+1
        n=len(temp)+1

        M = _populateMatrix(m, n, data,termHashTable, pmidHashTable)
        diseaseName = file[0:file.find('.txt')]
        IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M)
        counter += 1
        print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
Пример #11
0
def createDiseaseHash(dir,output=False):

    """
    Recieves a directory containing files to be hashed. It uses the
    filename as a key. It requires the files to be in .mtx format. The
    hashes starts from 1 ... number_of_files
    """

    diseaseHashes={}

    files = IO.getSortedFilelist(dir)
    counter=0
    for f in files:
        diseaseName=f[0:f.find('.mtx')]
        stdm=IO.readInTDM(dir,f)
        if stdm.shape[0]==1:
            continue
        if diseaseName not in diseaseHashes.keys():
            counter+=1
            if output:
                print 'Created', diseaseName, 'with hash', counter
            diseaseHashes[diseaseName]=counter

    IO.pickleOut(_hashTablesDir, diseaseHash,"btd", diseaseHashes)
Пример #12
0
def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Note that the terms are sanitized for any non-alphanumerical characters.
    And it is default to remove stop words.
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable={}
    pmidHashTable={}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')
#    files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)])

    # Get the regex pattern that sanitizeses strings.
    sanitizer = TextCleaner.sanitizeString()

    for file in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, file)

        # *Note*
        # Parts of the following loops could be optimized by using dictionaries
        # for direct loopkups instead of linear lookups, but since it's not
        # important, optimization will have to wait for another day.

        # Hash PMID's
        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid=record[0]
                if pmid not in pmidHashTable:
                    pmidCounter+=1
                    pmidHashTable[pmid]=pmidCounter

                information=''
                # Get the abstract
		try:
			information=' '+record[1]['AB']
		except:
			print 'Unable to get abstract', record[0]
		try:
			information+=' '+record[1]['TI']
		except:
			print 'Unable to get title for', record[0]

		if 'MH' in record[1]:
			for meshterm in record[1]['MH']:
				information+=' '+meshterm
		# We do not want to print this, as most of the
		# records do not have MeSH.
		# print 'Unable to get MeSH terms for', record[0]
		
                # Sanitize the information
                information=sanitizer.sub(' ', information)
                # remove stopwords from the abstract
                information=FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer: information=FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter+=1
                        termHashTable[term]=termCounter
                    else: continue
                
        print str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed."

    IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable)

    return termHashTable, pmidHashTable
Пример #13
0
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):

    """
    Recieves a subMatrixDir goes through all the files and sums up the
    column of it, creating a single row vector containing the sum of
    all column in the sub term doc matrix. It then proceeds to making
    a disease term doc, based on these row vector

    Optional flags are:

    avg, takes the average over the columns of the sub matrices
    instead of the sum.

    output, makes the funtion produce additional output

    time_log, makes the function print out how much time is spend on
    what
    """

    if output:
        print 'Initialising...'

    if time_log:
        t1 = time.time()

    files = IO.getSortedFilelist(subMatrixDir)

    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    # NOTE(review): 'diseaseHash' (and 'label' near the end) are read from
    # module scope; neither is defined in this function -- confirm they are
    # bound at module level before this runs.
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)

    # One row per disease plus the index header row; one column per term
    # plus the index header column.
    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))

    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))

    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
        count = 0

    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]

    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count

        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()

        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue

        # Row 0 is overwritten with the disease hash + the term-hash header
        # row of the sub matrix.
        subTermSum = getColumnSum(subTermDoc,avg)
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)

        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]

        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()

        if output:
            print 'Filling in values in disease matrix for', diseaseName
        # NOTE(review): row 1 of subTermSum is read here although only row 0
        # is assigned above -- this presumes getColumnSum() returns a matrix
        # whose row 1 holds the column sums/averages; verify against its
        # implementation.
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'

    # Hack way of making term hashes
    # NOTE(review): the header row is len(termHashTable)+1 wide but the
    # assigned range has len(termHashTable) entries -- confirm this
    # off-by-one is intended.
    diseaseMatrix[0,:] = range(0,len(termHashTable))

    if output:
        print 'Done making disease matrix, writing to'

    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)

    if output:
        print 'Done writing disease matrix.'

    return diseaseMatrix