Example #1
File: index.py Project: hellertime/leif
	def compressTermIdData(self,termId):
		return data.compressDocIdTermInstanceTable(self.termIdHash[termId])
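Example #2 below unpacks the return value of compressTermIdData as a (header, compressedData) pair, so the delegation to data.compressDocIdTermInstanceTable is taken to yield the table header (whose offset and length fields are used when writing to disk) together with the compressed bytes. A minimal sketch of that calling convention; the helper name compressOneTerm and its arguments are hypothetical, not part of the project:
	def compressOneTerm(partition, termId):
		# hypothetical helper (not from hellertime/leif); it only illustrates the
		# return convention that Example #2 relies on at its write sites
		header, compressedData = partition.compressTermIdData(termId)
		# 'header' carries the table metadata and 'compressedData' the serialized
		# DocIdTermInstanceTable bytes that get written into the partition file
		return header, compressedData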
Example #2
File: index.py Project: hellertime/leif
	def mergePartitions(self,termIdList,*partitions):
		"""Merge the data from partitions into self
		This must be a method on an ExternalPartition, MemoryPartitions have no concept of merging
		termIdList must contain the sorted list of all termIds in all partitions
		the resulting merged partition will contain one entry for each termId
		"""
		# seek constants... no need to import them from wherever...
		SEEK_END = 2
		SEEK_CUR = 1
		# internal functions to handle some preliminaries
		# makes the main merge code easier to understand, sacrificing overall function length
		def _growPartitionFile(howMuch):
			"""extend the disk partition by howMuch"""
			if os.path.exists(self.path): # prevent truncating existing file
				print >> sys.stderr, "Extending ExternalPartition %s" % self.path
				fp = open(self.path,"rb+")
			else:
				print >> sys.stderr, "Creating ExternalPartiton %s" % self.path
				fp = open(self.path,"wb")

			fp.seek(0,SEEK_END)
			previousSize = fp.tell()
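			# extend the file by seeking howMuch-1 bytes past the current end and
			# writing a single zero byte; everything in between becomes part of the file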
			fp.seek(howMuch - 1,SEEK_CUR)
			fp.write('\x00')
			newSize = fp.tell()
			print >> sys.stderr, "ExternalPartition grew %d bytes, has size %s" % (newSize-previousSize,newSize)
			fp.close()
			self.__mmap_init__()
		def _relocateDocIdTermInstanceTables():
			"""Relocates the existing tables to the end of the file maintaining proper offsets"""
			rp = open(self.path,"rb")
			wp = open(self.path,"rb+")
			wp.seek(0,SEEK_END)

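			# walk termIds from highest to lowest, copying each table to the slot just
			# below the previously relocated one, so the tables end up packed against
			# the new end of the file; assuming the tables were laid out in ascending
			# termId order, working backward avoids overwriting data not yet moved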
			for termId in reversed(sorted(self.termIdHash)):
				header = self.termIdHash[termId]
				rp.seek(header.offset)
				wp.seek(-header.length,SEEK_CUR)
				newOffset = wp.tell()
				wp.write(rp.read(header.length))
				wp.seek(newOffset)
				self.termIdHash[termId].offset = newOffset

			rp.close()
			wp.close()
		# Main Merge Logic
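		# three phases: grow the backing file to make room for the incoming data,
		# shift the existing tables to the end of the file, then write the merged
		# tables from the start of the file and truncate away the leftover slack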
		spaceNeeded = sum(map(lambda partition: partition.estimateSizeOnDisk(),partitions))
		_growPartitionFile(spaceNeeded)
		_relocateDocIdTermInstanceTables()
		wp = open(self.path,"rb+")

		for termId in termIdList:
			partitionsHoldingTermId = list()
			for partition in partitions:
				if termId in partition:
					partitionsHoldingTermId.append(partition)

			# always add self last
			if termId in self: partitionsHoldingTermId.append(self)
			if len(partitionsHoldingTermId) == 0: continue
			else:
				newOffset = wp.tell()

			if len(partitionsHoldingTermId) == 1:
				partition = partitionsHoldingTermId[0]
				#print >> sys.stderr, "Merge single instance of termId %d from %s" % (termId,partition.name)
				header,compressedData = partition.compressTermIdData(termId)
				wp.write(compressedData)
				header.offset = newOffset
				self.termIdHash[termId] = header

				if partition is not self: partition.deleteTermId(termId)
			else:
				table = data.DocIdTermInstanceTable()
				for partition in partitionsHoldingTermId:
					#print >> sys.stderr, "Merge multi instance termId %s from %s" % (termId,partition.name)
					for docIdTermInstanceVector in partition.lookupTermId(termId):
						docId = docIdTermInstanceVector.docId
						for termInstance in docIdTermInstanceVector.termInstancesGenerator:
							table.insertTermInstanceRecord(docId,termInstance)

					if partition is not self: partition.deleteTermId(termId)

				header,compressedData = data.compressDocIdTermInstanceTable(table) 
				wp.write(compressedData)
				header.offset = newOffset
				self.termIdHash[termId] = header

		wp.truncate()
		print >> sys.stderr, "ExternalPartition was truncated to size %d" % wp.tell()
		wp.close()
		self.__mmap_init__()
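A minimal sketch of how mergePartitions might be driven when flushing in-memory partitions into the on-disk one. None of this is from hellertime/leif: the name flushMemoryPartitions and its arguments are hypothetical, and it assumes each partition exposes the termIdHash mapping the snippets above use. It only illustrates the calling convention from the docstring: a sorted termIdList covering every termId in every partition, with the partitions to absorb passed as *partitions.
	def flushMemoryPartitions(externalPartition, memoryPartitions):
		# hypothetical driver (not from the project): build the sorted termIdList
		# that mergePartitions requires, then hand over the in-memory partitions
		allTermIds = set()
		for partition in memoryPartitions:
			allTermIds.update(partition.termIdHash.keys())
		# include the target's own termIds; the merge loop re-emits an entry for
		# any termId already present in self
		allTermIds.update(externalPartition.termIdHash.keys())
		termIdList = sorted(allTermIds)
		externalPartition.mergePartitions(termIdList, *memoryPartitions)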