def divideIntoChunks(self, filenameGenerator, maxListBytes=536870912):
    """Split the lines of every file in ``self.files`` into chunk files.

    Lines are read file after file and accumulated in one in-memory buffer
    that is shared across input files (it is NOT flushed at the end of each
    input file). Whenever the buffer grows past ``maxListBytes`` it is
    written out with ``FileHelper.writeListInFile`` and a fresh buffer is
    started. Any remaining lines are written as a final chunk.

    Parameters:
        filenameGenerator: callable taking the 0-based chunk index and
            returning the name of the chunk file to write.
        maxListBytes: flush threshold, compared against
            ``sys.getsizeof(buffer)``. The default (536870912) is 512 MiB.
            NOTE: ``sys.getsizeof`` is shallow -- it measures the list
            object itself, not the strings it references -- so this is not
            the size of the text held in memory.

    Side effects:
        Sets ``self.nChunks`` to the number of chunk files written.

    Returns:
        None.
    """
    stackOfValues = []
    counter = 0
    for inputFileName in self.files:
        # The context manager closes the file even if reading/writing raises.
        with open(inputFileName, "r") as filePointer:
            for line in filePointer:
                stackOfValues.append(line)
                if sys.getsizeof(stackOfValues) > maxListBytes:
                    FileHelper.writeListInFile(filenameGenerator(counter), stackOfValues)
                    counter += 1
                    # Start a new buffer; without this every later line would
                    # re-write all previously flushed lines into a new chunk.
                    # Rebinding (rather than clearing in place) is safe even
                    # if the helper keeps or empties the list it was given.
                    stackOfValues = []
    if stackOfValues:
        # Flush whatever is left below the threshold.
        FileHelper.writeListInFile(filenameGenerator(counter), stackOfValues)
        counter += 1
    self.nChunks = counter
def writeDictio(self, dicFromKeyToListOfValues):
    """Write one chunk file per key and update the per-key node files.

    For every key of ``dicFromKeyToListOfValues``:
      1. ``self.nDifferentChunks`` is incremented and the key's values are
         written to the chunk file named by ``self.chunkNameGenerator``.
      2. If the key is already in ``self.oldDictFromKeyToNodeFile``, its
         node-file index is reused and ``FileHelper.copyFile`` is called
         with the old and new node file names (presumably copying old to
         new -- confirm against FileHelper). Otherwise
         ``self.nDifferentKeys`` is incremented, used as the new index, and
         an empty node file is created.
      3. The chunk file name followed by a newline is appended to the
         node file.

    Finally, every key of ``self.oldDictFromKeyToNodeFile`` that is absent
    from ``dicFromKeyToListOfValues`` is carried over: its index is stored
    in ``self.dictFromKeyToNodeFile`` and ``FileHelper.copyFile`` is called
    for its node file.

    Parameters:
        dicFromKeyToListOfValues: dict mapping a key to the list of values
            to write for that key.

    Returns:
        None.
    """
    for key, listOfValues in dicFromKeyToListOfValues.items():
        # Write the list of values for this key into a new chunk.
        self.nDifferentChunks += 1
        chunkFilename = self.chunkNameGenerator(self.nDifferentChunks)
        FileHelper.writeListInFile(chunkFilename, listOfValues)

        if key in self.oldDictFromKeyToNodeFile:
            # The node file already exists: reuse its index and copy it.
            nodeFileIdx = self.oldDictFromKeyToNodeFile[key]
            self.dictFromKeyToNodeFile[key] = nodeFileIdx
            oldNodeFileName = self.oldNodeFileNameGenerator(nodeFileIdx)
            nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
            FileHelper.copyFile(oldNodeFileName, nodeFileName)
        else:
            # First time this key is seen: allocate a new node file index.
            self.nDifferentKeys += 1
            nodeFileIdx = self.nDifferentKeys
            self.dictFromKeyToNodeFile[key] = nodeFileIdx
            nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
            # Create the node file empty (truncating any stale file of
            # the same name) before the append below.
            with open(nodeFileName, 'w+'):
                pass

        # Append the new chunk name to the node file.
        with open(nodeFileName, 'a') as nodePointer:
            nodePointer.write(chunkFilename + "\n")

    # Carry over node files of keys seen before but absent from this call.
    for key, nodeFileIdx in self.oldDictFromKeyToNodeFile.items():
        if key not in dicFromKeyToListOfValues:
            self.dictFromKeyToNodeFile[key] = nodeFileIdx
            oldNodeFileName = self.oldNodeFileNameGenerator(nodeFileIdx)
            nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
            FileHelper.copyFile(oldNodeFileName, nodeFileName)