Python IOmodule.pickleOut: примеры использования

Язык программирования: Python

Класс/Тип: IOmodule

Метод/Функция: pickleOut

Примеров на hotexamples.com: 6

Python IOmodule.pickleOut — найдено 6 примеров. Это лучшие примеры Python-кода для IOmodule.pickleOut, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getSortedFilelist(13)

readInTDM(9)

writeOutTDM(9)

pickleIn(6)

pickleOut(6)

writeOutTxt(3)

read_2col(2)

Пример #1

Показать файл

Файл: TermDoc.py Проект: hmbachelor/bachelor

def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.

    Every file listed in the sub-matrix directory is read as a term-doc
    matrix. The stored values of its first column (skipping the first row)
    are used as PMIDs, and each PMID is mapped to a list of labels, where a
    label is the file name minus its last four characters. The resulting
    dictionary is pickled to the hash-table directory; nothing is returned.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash = {}

    fileCount = 0
    for fileName in files:
        subMatrix = IOmodule.readInTDM(_subMatrixDir, fileName)
        colMatrix = subMatrix.tocsc()

        # Stored values of the first column, skipping row 0
        # (presumably a header row -- TODO confirm).
        pmids = colMatrix.getcol(0)[1:].data

        # Drop the last four characters of the file name
        # (presumably the '.mtx' extension -- TODO confirm).
        label = fileName[:-4]

        for pmid in pmids:
            # setdefault creates the list on first sight of a PMID without
            # resorting to a bare 'except' that would hide unrelated errors.
            labelHash.setdefault(pmid, []).append(label)

        fileCount += 1
        print("Remaining: %s Completed %s" % (len(files) - fileCount, label))

    t2 = time.time()

    print('Created disease label hash in: ' + str(t2 - t1))

    IOmodule.pickleOut(_hashTablesDir, _labelHash, "btd", labelHash)

Пример #2

Показать файл

Файл: testModule.py Проект: hmbachelor/bachelor

def createVLHash(M_lil):

    """
    Compute a norm for every row of M_lil and pickle the resulting table.

    For each row index, the first entry of the row's stored data is dropped
    and linalg.norm is taken over the remaining entries. The table, keyed
    by row index, is written out under the name "VLHash". Nothing is
    returned.
    """

    rowCount = M_lil.shape[0]

    normByRow = {}
    for rowIndex in range(rowCount):
        storedValues = M_lil.getrow(rowIndex).data[0]
        normByRow[rowIndex] = linalg.norm(storedValues[1:])

    IOmodule.pickleOut("/root/The_Hive/term_doc/hashTables", "VLHash", "btd", normByRow)

Пример #3

Показать файл

Файл: SearchTermDoc.py Проект: hmbachelor/bachelor

def createCLHash(M_coo,filename,save_file=True):

    """
    Precompute and save the length of each column vector in the term-doc matrix.
    Here the length refers to the number of elements.

    M_coo     -- term-doc matrix; it is transposed and converted with tolil()
    filename  -- name handed to IOmodule.pickleOut when the hash is saved
    save_file -- if True (default), print progress, pickle the hash and
                 return None; if False, return the hash without saving it
    """

    t1 = time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    # Columns of M_coo become rows, so each term can be fetched with getrow().
    M_lil = (M_coo.transpose()).tolil()

    CLHash = {}
    count = 0
    for termHash in range(1, M_coo.shape[1]):
        # Number of non-zero entries of the term vector, minus the first one.
        termVectorLength = len((M_lil.getrow(termHash).nonzero()[0])[1:])
        CLHash[termHash] = termVectorLength
        count += 1
        if save_file:
            print("Hashes created: "+str(count)+". Length:"+str(termVectorLength))

    if not save_file:
        return CLHash

    # Save first, then stop the timer and report: the message says
    # "Created and saved", so it must not be printed (or timed) before the
    # pickle has actually been written.
    IOmodule.pickleOut(_hashTablesDir, filename, "btd", CLHash)

    t2 = time.time()
    print("Created and saved ColumnLength-hash in: "+str(t2-t1))

Пример #4

Показать файл

Файл: SearchTermDoc.py Проект: hmbachelor/bachelor

def createRLHash(M_lil,filename,save_file=True):

    """
    Compute the norm of every row vector of the term-doc matrix, starting
    at row 1, and either save or return the resulting table.

    With save_file set (the default) progress is printed, the table is
    pickled under the given filename and the elapsed time is reported;
    nothing is returned. Otherwise the table is returned and not saved.
    """

    startTime = time.time()

    hashDirExists = os.path.isdir(_hashTablesDir)
    if not hashDirExists:
        os.mkdir(_hashTablesDir)

    rowNorms = {}
    created = 0
    for rowIndex in range(1, M_lil.shape[0]):
        # The first entry of the row's stored data is left out of the norm.
        rowValues = M_lil.getrow(rowIndex).data[0]
        rowNorms[rowIndex] = linalg.norm(rowValues[1:])
        created += 1
        if save_file:
            print("Hashes created: "+str(created))

    if not save_file:
        return rowNorms

    IOmodule.pickleOut(_hashTablesDir, filename, "btd", rowNorms)

    elapsed = time.time() - startTime
    print("Created and saved RowLength-hash in: "+str(elapsed))

Пример #5

Показать файл

Файл: DiseaseMatrix.py Проект: hmbachelor/bachelor

def createDiseaseHash(dir,output=False):

    """
    Receives a directory containing files to be hashed. It uses the
    filename (the part before '.mtx') as a key. It requires the files to
    be in .mtx format. Files whose matrix has only a single row are
    skipped; the remaining names are numbered consecutively from 1 in
    sorted-file order.

    dir    -- directory handed to IO.getSortedFilelist and IO.readInTDM
    output -- if True, print a line for every hash that is created

    The resulting dictionary is pickled to the hash-table directory;
    nothing is returned.
    """

    diseaseHashes = {}

    files = IO.getSortedFilelist(dir)
    counter = 0
    for f in files:
        # partition() yields the text before the first '.mtx'. Unlike
        # slicing with find(), it does not chop the last character off a
        # name that has no '.mtx' in it (find() returns -1 in that case).
        diseaseName = f.partition('.mtx')[0]
        stdm = IO.readInTDM(dir, f)
        if stdm.shape[0] == 1:
            continue
        # Test membership on the dict itself: on Python 2, .keys() builds
        # a list and makes this a linear scan for every file.
        if diseaseName not in diseaseHashes:
            counter += 1
            if output:
                print('Created %s with hash %s' % (diseaseName, counter))
            diseaseHashes[diseaseName] = counter

    # NOTE(review): 'diseaseHash' is not defined inside this function; it
    # has to be provided at module level -- confirm.
    IO.pickleOut(_hashTablesDir, diseaseHash, "btd", diseaseHashes)

Пример #6

Показать файл

Файл: TermDoc.py Проект: hmbachelor/bachelor

def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    For every record the abstract ('AB'), the title ('TI') and the MeSH
    terms ('MH') are concatenated when present. The text is then sanitized
    for any non-alphanumerical characters, stop words are removed and, if
    _stemmer is set, the text is stemmed. Every new PMID and every new term
    receives the next consecutive number, starting from 1.

    Both tables are pickled to the hash-table directory.

    Returns the tuple (termHashTable, pmidHashTable).
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable = {}
    pmidHashTable = {}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for fileName in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, fileName)

        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid = record[0]
                fields = record[1]

                # Hash the PMID
                if pmid not in pmidHashTable:
                    pmidCounter += 1
                    pmidHashTable[pmid] = pmidCounter

                information = ''
                # Get the abstract. A record with a missing (or non-string)
                # abstract is still processed; the problem is only reported.
                try:
                    information = ' '+fields['AB']
                except (KeyError, TypeError):
                    print('Unable to get abstract %s' % (pmid,))
                # Get the title, with the same best-effort handling.
                try:
                    information += ' '+fields['TI']
                except (KeyError, TypeError):
                    print('Unable to get title for %s' % (pmid,))

                # We do not want to print anything when the MeSH terms are
                # missing, as most of the records do not have MeSH.
                if 'MH' in fields:
                    information += ''.join(' '+meshterm for meshterm in fields['MH'])

                # Sanitize the information
                information = sanitizer.sub(' ', information)
                # remove stopwords from the abstract
                information = FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer:
                    information = FilterInterface.porterStemmer(information)

                # Hash every term that has not been seen before.
                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter += 1
                        termHashTable[term] = termCounter

        # Running totals, reported once per processed file.
        print(str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed.")

    IOmodule.pickleOut(hashTables, _termHash, "btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash, "btd", pmidHashTable)

    return termHashTable, pmidHashTable